Example #1
0
def test_get_destination_age_number():
    """Smoke-test page-number lookup for the outline of a sample PDF.

    Opens ``pdflatex-outline.pdf`` under ``RESOURCE_ROOT`` and asks the
    reader for the destination page number of every outline item that is
    not a list. Nothing is asserted; the test passes as long as no call
    raises.
    """
    pdf_path = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")
    pdf = PdfFileReader(pdf_path)
    for entry in pdf.getOutlines():
        if isinstance(entry, list):
            # List items are skipped; only direct entries are resolved.
            continue
        pdf.getDestinationPageNumber(entry)
Example #2
0
 def __choose(self, reader: PdfFileReader, outlines: list, idx: int):
     """Copy the chapter at ``outlines[idx]`` from *reader* into the writer.

     The chapter length is obtained from ``__chapter_pages``; that many
     consecutive pages, starting at the outline's destination page, are
     added to ``self._writer``. ``self._written_pages`` is bumped once for
     every page added.
     """
     chapter = outlines[idx]
     page_count = self.__chapter_pages(reader, outlines, idx)
     first_page = reader.getDestinationPageNumber(chapter)
     for offset in range(page_count):
         source_page = reader.getPage(first_page + offset)
         self._writer.addPage(source_page)
         self._written_pages += 1
Example #3
0
    def __chapter_pages(self, reader: PdfFileReader, outlines: list,
                        idx: int) -> int:
        """Return the number of pages spanned by the chapter at ``outlines[idx]``.

        The length is measured from the chapter's destination page to the
        first boundary found, tried in this order:

        1. the first ``Destination`` that follows ``idx`` in *outlines*;
        2. when the level stored in the last ``self.state_list`` entry is
           not ``START_LEVEL``, the positive distance reported by
           ``__pages_to_next_upper_chapter``;
        3. the end of the document (``reader.numPages``).
        """
        start_page = reader.getDestinationPageNumber(outlines[idx])

        # Entries after idx that are not Destination instances are ignored.
        later_chapters = (entry for entry in outlines[idx + 1:]
                          if isinstance(entry, Destination))
        following_chapter = next(later_chapters, None)
        if following_chapter is not None:
            return reader.getDestinationPageNumber(
                following_chapter) - start_page

        level, *_ = self.state_list[-1]
        if level != START_LEVEL:
            distance = self.__pages_to_next_upper_chapter(
                reader, start_page, level)
            # A non-positive distance means no usable boundary was found.
            if distance > 0:
                return distance

        return reader.numPages - start_page
Example #4
0
def split_by_sections(path):
    """Split a PDF into one file per top-level bookmark.

    Only the first level of the outline is used. Each section starts on the
    page its bookmark points to and ends on the page before the next
    top-level bookmark's page; the last section runs to the end of the
    document. Output files are written relative to the current working
    directory and named ``<source name>-<n>-<title>.pdf``.

    Args:
        path (str): Path of the PDF file to split.
    """
    # Base name (no directory, no extension) used as the output prefix.
    filename = os.path.splitext(os.path.basename(path))[0]

    pdf_reader = PdfFileReader(path)

    # Nested lists in the outline are skipped so that only top-level
    # entries are treated as sections; indexing a list with '/Title'
    # would raise TypeError.
    top_level = [entry for entry in pdf_reader.outlines
                 if not isinstance(entry, list)]

    titles = [entry['/Title'] for entry in top_level]
    # Zero-based index of the first page of every section.
    first_pages = [pdf_reader.getDestinationPageNumber(entry)
                   for entry in top_level]
    # Exclusive end of every section: the next section's first page, or the
    # page count for the last section.
    stop_pages = first_pages[1:] + [pdf_reader.numPages]

    sections = zip(titles, first_pages, stop_pages)
    for idx, (title, first_page, stop_page) in enumerate(sections):
        # Each section is stored in its own PDF.
        pdf_writer = PdfFileWriter()
        for page_number in range(first_page, stop_page):
            pdf_writer.addPage(pdf_reader.getPage(page_number))

        # A path separator inside the title would be taken as a directory
        # component by open() and make the write fail.
        safe_title = str(title).replace('/', '_').replace(os.sep, '_')
        output_filename = f'{filename}-{idx + 1}-{safe_title}.pdf'
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
Example #5
0
 def __pages_to_next_upper_chapter(self, reader: PdfFileReader,
                                   current_page: int,
                                   current_level: int) -> int:
     """Return the page distance from *current_page* to a later upper-level chapter.

     ``self.state_list`` is walked from its last entry backwards. For each
     entry whose level is lower than *current_level*, the outlines after
     that entry's index are scanned and the first ``Destination`` lying
     strictly after *current_page* decides the result.

     Returns:
         The positive number of pages up to that destination, or ``0``
         when no such destination exists.
     """
     for level, siblings, position in reversed(self.state_list):
         if level >= current_level:
             continue
         for entry in siblings[position + 1:]:
             if not isinstance(entry, Destination):
                 continue
             distance = reader.getDestinationPageNumber(entry) - current_page
             if distance > 0:
                 return distance
     return 0
# Write one PDF per outline entry that is directly followed by a list of
# sub-entries. Each output covers the pages from the entry's own destination
# up to the destination of its last (flattened) sub-entry.
# NOTE(review): `chapters`, `flatten`, `file_to_read` and
# `dir_to_save_chapters` are not defined in this section; they are assumed
# to be set up earlier in the file -- confirm.
with open(file_to_read, 'rb') as file_stream:
    # The reader is only used inside this block, so the stream stays open
    # for every page access and is closed even if an error occurs.
    pdf_content = PdfFileReader(file_stream)
    outlines = pdf_content.getOutlines()

    max_number_of_characters = 100

    # Stop one entry short: the last entry has no successor, and looking at
    # outlines[i + 1] for it would raise IndexError.
    for i, item in enumerate(outlines[:-1]):
        if not (isinstance(item, generic.Destination)
                and isinstance(outlines[i + 1], list)):
            continue
        # Build a file-name-friendly title: no slashes, no spaces, and at
        # most max_number_of_characters characters.
        title = item.title
        title = '_'.join(title.strip().replace('/', '_').split(' '))
        title = title[:max_number_of_characters]
        # Put the chapter entry in front of its sub-entries so that the
        # first flattened element marks the chapter's start page.
        outlines[i + 1].insert(0, item)
        content = outlines[i + 1]
        chapters.append((title, content))

    for chapter in chapters:
        subchapters = flatten(chapter[1])
        file_to_write = dir_to_save_chapters / f'{chapter[0]}.pdf'

        pdf_writer = PdfFileWriter()
        start_page = pdf_content.getDestinationPageNumber(subchapters[0])
        end_page = pdf_content.getDestinationPageNumber(subchapters[-1])

        # end_page is inclusive.
        for i in range(start_page, end_page + 1):
            pdf_writer.addPage(pdf_content.getPage(i))
        with open(file_to_write, 'wb') as f:
            pdf_writer.write(f)
# Read the bookmarks of `srcfile` and, for every bookmark whose title is
# shorter than 22 characters and starts with a word character, look the
# title up as an investor ID in the `fundinvestors` and `investors`
# collections.
# NOTE(review): `srcfile`, `filename`, `client` and `error_log` are not
# defined in this section; they are assumed to come from earlier in the file.
with open(srcfile, "rb") as f:
    pdf = PdfFileReader(f)
    # Try bookmarks without child
    try:
        bookmarks = pdf.getOutlines()
    except Exception:
        # Any failure while reading the outline is logged with the message
        # below and ends the script. Catching Exception instead of using a
        # bare except lets KeyboardInterrupt and SystemExit propagate.
        upload = False
        errormsg = "this file contains bookmarks with child"
        error_log(filename, upload, errormsg)
        sys.exit()
    # Read Bookmarks
    if bookmarks:
        for b in bookmarks:
            # NOTE(review): a nested list of child bookmarks in `bookmarks`
            # would fail on this lookup -- confirm whether such files can
            # reach this point.
            invID = b['/Title']
            # Raw string: '\w' in a plain literal is an invalid escape
            # sequence; the pattern itself is unchanged.
            if len(invID) < 22 and re.match(r'\w', invID):
                # NOTE(review): `i` is not used in the visible part of the
                # script; presumably consumed further down -- confirm.
                i = pdf.getDestinationPageNumber(b)
                # Search InvID in database
                # Connect to db
                db = client.iportalDevDB19
                # Connect to collection
                collection = db.investors
                collection2 = db.fundinvestors
                rinvID = ''
                fundID = ''
                # When several documents match, the last one wins.
                for y in collection2.find({"invID": invID}):
                    fundID = str(y['fundID'])
                    print(fundID)
                # The investor record is only looked up when a fund matched.
                if fundID:
                    for x in collection.find({"invID": invID}):
                        rinvID = str(x['_id'])
 
Example #8
0
 def __is_the_end(self, reader: PdfFileReader, outlines: list,
                  idx: int) -> bool:
     """Tell whether the chapter at ``outlines[idx]`` reaches the last page.

     Returns:
         ``True`` when the chapter length reported by ``__chapter_pages``
         equals the number of pages from the chapter's destination page to
         the end of the document.
     """
     total_pages = reader.numPages
     chapter_start = reader.getDestinationPageNumber(outlines[idx])
     pages_until_end = total_pages - chapter_start
     chapter_length = self.__chapter_pages(reader, outlines, idx)
     return chapter_length == pages_until_end