class PDFReader:
    """Convenience wrapper around a PyPDF2 ``PdfFileReader``.

    The constructor opens ``<filename>.pdf`` in binary mode; the handle is kept
    on ``self.file`` and the reader on ``self.pdf``.
    """

    def __init__(self, filename):
        """Open ``filename + ".pdf"`` and attach a ``PdfFileReader`` to it."""
        self.filename = filename
        self.file = open(self.filename + ".pdf", 'rb')
        self.pdf = PdfFileReader(self.file)

    def printBookDetails(self):
        """Print the page count, title and author of the opened PDF."""
        # Fetch the document information once instead of once per field.
        info = self.pdf.getDocumentInfo()
        print("Details of the book")
        print("Number of pages:", self.pdf.getNumPages())
        print("Title:", info.title)
        print("Author:", info.author)

    def printPage(self, pageNo):
        """Print the extracted text of the page at index ``pageNo``.

        ``pageNo`` is handed to ``PdfFileReader.getPage`` unchanged.
        """
        print("Reading Page ", pageNo)
        page = self.pdf.getPage(pageNo)
        print(page.extractText())

    def printOutline(self):
        """Print the title of every top-level outline entry."""
        print("Book Outline")
        for heading in self.pdf.getOutlines():
            # Nested lists hold child entries; only top-level ones are shown.
            if not isinstance(heading, list):
                print(dict(heading).get('/Title'))

    def add_watermark_to_pdf(self, src, dst, water):
        """Merge the first page of ``water`` onto every page of ``src``.

        The result is written to ``dst``. Its metadata is copied from the
        watermark document, with ``/Producer`` and ``/Title`` overridden.

        Progress is reported through ``self.progress_changed.emit`` as
        ``(page_number, total)`` for each page and ``(total + 1, total)`` once
        the output has been written.

        NOTE(review): ``self.progress_changed`` and ``self._add_bookmark`` are
        not defined in this class as shown; they must be provided elsewhere.

        Args:
            src: Path of the PDF to watermark.
            dst: Path of the PDF to create.
            water: Path of the PDF whose first page is the watermark.
        """
        # Both inputs stay open until the output is written, and both are
        # closed even if a step in between raises.
        with open(water, 'rb') as file_water, open(src, 'rb') as file_src:
            water_reader = PdfFileReader(file_water)
            water_page = water_reader.getPage(0)

            source_reader = PdfFileReader(file_src)
            total_pages = source_reader.numPages
            dest_write = PdfFileWriter()
            for pageNum in range(total_pages):
                self.progress_changed.emit(pageNum + 1, total_pages)
                pdf_page = source_reader.getPage(pageNum)
                pdf_page.mergePage(water_page)
                dest_write.addPage(pdf_page)

            org_info = water_reader.getDocumentInfo()
            infos = {}
            for k in org_info:
                infos[k] = org_info[k]
                print(k, org_info[k])

            infos['/Producer'] = 'LiuShengKun'
            infos['/Title'] = os.path.basename(src)
            dest_write.addMetadata(infos)
            outlines = source_reader.getOutlines()
            self._add_bookmark(dest_write, outlines)
            with open(dst, 'wb') as f:
                dest_write.write(f)

        self.progress_changed.emit(total_pages + 1, total_pages)
Beispiel #3
0
def page_extract():
    """Render a page range of ``PDF_DIR`` to PNGs and build cards from them.

    Each page from ``start`` to ``end`` (module-level values, 1-based and
    inclusive) is written to a temporary single-page PDF under ``TMP_DIR``,
    converted with ``gs_pdf_to_png`` and paired with ``closest(bmrks, page)``.
    The collected items are regrouped with ``group(png_list, 4)`` and handed
    to ``make_cards``.

    NOTE(review): ``PDF_DIR``, ``TMP_DIR``, ``start``, ``end`` and the helper
    functions are defined outside this block.
    """
    # The output name depends only on PDF_DIR, so split it once up front.
    name, ext = os.path.splitext(os.path.basename(PDF_DIR))

    png_list = []

    # Keep the input open only while pages are read, and always close it.
    with open(PDF_DIR, 'rb') as pdf_file:
        PDF_IN = PdfFileReader(pdf_file)

        pg_id_num_map = page_id_to_num(PDF_IN)
        outlines = PDF_IN.getOutlines()
        bmrks = bookmarks(outlines, pg_id_num_map)

        for i in range(int(start) - 1, int(end)):
            output = PdfFileWriter()
            output.addPage(PDF_IN.getPage(i))

            PDF_OUT = '{}{}'.format(TMP_DIR, '{}-{}{}'.format(name, str(i).zfill(6), ext))

            with open(PDF_OUT, 'wb') as outputStream:
                output.write(outputStream)

            try:
                png_list.append(gs_pdf_to_png(PDF_OUT))
                png_list.append(closest(bmrks, i + 1))
            finally:
                # Remove the temporary PDF even when conversion fails.
                os.remove(PDF_OUT)

    for tup in group(png_list, 4):
        make_cards(tup[0], tup[2], tup[3])
        # Single-argument call form prints identically on Python 2 and 3.
        print("Current Tag Processed: " + tup[3])
Beispiel #4
0
def test_get_destination_age_number():
    """Resolve the page number of each top-level outline entry of the sample PDF."""
    pdf_path = os.path.join(RESOURCE_ROOT, "pdflatex-outline.pdf")
    pdf = PdfFileReader(pdf_path)
    # Nested lists are child outline levels, not destinations; leave them out.
    top_level_entries = [
        entry for entry in pdf.getOutlines() if not isinstance(entry, list)
    ]
    for entry in top_level_entries:
        pdf.getDestinationPageNumber(entry)
Beispiel #5
0
def bookmarks(PDF):
    """Build a page-range table of contents from a PDF's top-level bookmarks.

    Top-level outline entries whose title is in the module-level ``exclude``
    collection are ignored, as are nested (child) outline lists.

    Args:
        PDF: Path of the PDF file to read.

    Returns:
        A list of ``(first_page, last_page, title)`` tuples with 1-based page
        numbers. ``last_page`` is the page before the next bookmark, so the
        final bookmark produces no entry because it has no successor.
    """

    def page_id_to_num(pdf, pages=None, _result=None, _num_pages=None):
        """Map each page-tree object id to the count of leaf pages seen before it."""
        if _result is None:
            _result = {}

        if pages is None:
            _num_pages = []
            pages = pdf.trailer["/Root"].getObject()["/Pages"].getObject()

        t = pages["/Type"]

        if t == "/Pages":
            for page in pages["/Kids"]:
                _result[page.idnum] = len(_num_pages)
                page_id_to_num(pdf, page.getObject(), _result, _num_pages)

        elif t == "/Page":
            # The list is only used as a shared counter across the recursion.
            _num_pages.append(1)

        return _result

    def collect(outlines, pg_id_num_map, result=None):
        """Flatten an outline tree into ``(page_number, title)`` tuples."""
        if result is None:
            result = []

        if isinstance(outlines, list):
            for outline in outlines:
                result = collect(outline, pg_id_num_map, result)

        elif isinstance(outlines, PyPDF2.pdf.Destination):
            result.append((pg_id_num_map[outlines.page.idnum] + 1, outlines['/Title']))

        return result

    # Read everything needed while the file is open, then release the handle.
    with open(PDF, 'rb') as pdf_file:
        PDF_IN = PdfFileReader(pdf_file)
        pg_id_num_map = page_id_to_num(PDF_IN)

        outlines = [
            item for item in PDF_IN.getOutlines()
            if not isinstance(item, list) and item['/Title'] not in exclude
        ]
        bmrks = collect(outlines, pg_id_num_map)

    # Pair each bookmark with its successor; zip stops before the last one,
    # replacing the former bare ``except`` around ``next()``.
    TOC = []
    for current, following in zip(bmrks, bmrks[1:]):
        TOC.append((current[0], following[0] - 1, current[1]))
    return TOC
Beispiel #6
0
def extract_bookmarks(pdf_filename):
    """Collect the bookmarks of a PDF file.

    Args:
        pdf_filename: Path of the PDF file to read.

    Returns:
        The list passed to ``_recursive_extract_bookmarks`` together with the
        outline tree and the page map (presumably filled in place by that
        helper; verify against its definition).
    """
    found = []
    # The context manager closes the file even when a helper raises.
    with open(pdf_filename, 'rb') as pdf_file:
        pdf = PdfFileReader(pdf_file)

        page_map = _construct_page_id_to_page_number_map(pdf)
        outlines = pdf.getOutlines()

        _recursive_extract_bookmarks(outlines, page_map, found)

    return found
Beispiel #7
0
def get_page_numbers(pdf_name):
    """Return a title-to-page mapping of the PDF's bookmarks and its page count.

    Args:
        pdf_name: Path of the PDF file to read.

    Returns:
        A ``(pages, total_pages)`` tuple. ``pages`` maps each bookmark's
        ``"title"`` to its ``"page"`` as reported by ``outlines_pg_zoom_info``.
    """
    with open(pdf_name, "rb") as pdf_file:
        reader = PdfFileReader(pdf_file)
        total_pages = reader.numPages
        # Page object ids have to be translated to page numbers first.
        id_to_number = _setup_page_id_to_num(reader)
        bookmarks_info = outlines_pg_zoom_info(reader.getOutlines(), id_to_number)

    pages = {}
    for meta in bookmarks_info.values():
        pages[meta["title"]] = meta["page"]

    return pages, total_pages
Beispiel #8
0
def getPdfOutlines(pdfpath, listpath, isPage):
    """Export the outline of a PDF into a Word document named ``list.docx``.

    Args:
        pdfpath: Path of the PDF file to read.
        listpath: Directory in which ``list.docx`` is created.
        isPage: Passed through unchanged to ``getOutline``.

    Returns:
        The path of the saved Word document.
    """
    # Module-level list holding the collected outline. It is reset before each
    # run; presumably ``getOutline`` appends to it (verify against the helper).
    global returnlist
    with open(pdfpath, "rb") as file:
        doc = PdfFileReader(file)
        outlines = doc.getOutlines()  # Fetch the outline tree.
        returnlist = []
        mylist = getOutline(outlines, isPage)  # Walk the outline recursively.
        outpath = os.path.join(listpath, 'list.docx')  # Target document path.

        w = DispatchEx("Word.Application")  # Start a Word application object.
        try:
            w.Visible = 1
            w.DisplayAlerts = 0
            doc1 = w.Documents.Add()  # Add an empty Word document.
            range1 = doc1.Range(0, 0)
            for item in mylist:  # Insert every outline entry into the document.
                range1.InsertAfter(item)

            doc1.SaveAs(outpath)  # Save the document.
            doc1.Close()  # Close the document.
        finally:
            # Always shut Word down so a failure does not leave it running.
            w.Quit()
    return outpath
Beispiel #9
0
def getPdfOutlines(pdfpath, listpath, isList):
    """Export the outline of a PDF document into ``list.docx``.

    Args:
        pdfpath: Path of the PDF file to read.
        listpath: Directory in which ``list.docx`` is created.
        isList: Passed through unchanged to ``getOutline``.

    Returns:
        The path of the saved Word document.
    """
    global returnlist
    with open(pdfpath, 'rb') as file:
        doc = PdfFileReader(file)
        outlines = doc.getOutlines()
        # Reset the declared global. The original assigned ``returnList``
        # (different case), which only created an unused local and left the
        # global untouched between calls.
        returnlist = []
        mylist = getOutline(outlines, isList)
        outpath = os.path.join(listpath, 'list.docx')

        w = DispatchEx('Word.Application')
        try:
            w.Visible = 1
            w.DisplayAlerts = 0
            doc1 = w.Documents.Add()
            range1 = doc1.Range(0, 0)
            for item in mylist:
                range1.InsertAfter(item)
            doc1.SaveAs(outpath)
            doc1.Close()
        finally:
            # Always shut Word down so a failure does not leave it running.
            w.Quit()
    return outpath
def merge_pdf_template(src_pdf_path, template_pdf_path, dst_pdf_path):
    """Prepend a cover page and watermark every page of a PDF.

    The template must have at least two pages: page 0 is used as the cover
    and page 1 as the watermark (with header and footer) merged onto each
    source page. The outline of the source is carried over by
    ``write_outline``.

    Any exception is caught and its message printed; nothing is raised.

    Args:
        src_pdf_path: Path of the PDF to decorate.
        template_pdf_path: Path of the two-page template PDF.
        dst_pdf_path: Path of the PDF to create.
    """
    try:
        template = PdfFileReader(template_pdf_path, strict=False)
        if template.getNumPages() < 2:
            print('%s page num must >=2, page 0 for cover page 1 for watermark and header footer!!!!'
                  % (template_pdf_path,))
            return
        cover_page = template.getPage(0)
        watermark_page = template.getPage(1)
        pdf_reader = PdfFileReader(src_pdf_path, strict=False)
        pdf_outlines = pdf_reader.getOutlines()

        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(cover_page)
        pdf_writer.appendPagesFromReader(pdf_reader)
        # The cover occupies writer index 0, so source page N sits at N + 1.
        # Indexing from 0 stamped the cover and skipped the last source page.
        for page_index in range(pdf_reader.getNumPages()):
            pdf_page = pdf_writer.getPage(page_index + 1)
            pdf_page.mergePage(watermark_page)

        write_outline(pdf_reader, pdf_writer, pdf_outlines, None)
        # The context manager closes the output even if writing fails.
        with open(dst_pdf_path, 'wb') as pdfOutputFile:
            pdf_writer.write(pdfOutputFile)
    except Exception as err:
        message = str(err)
        # Python 2 byte strings are unescaped as before; Python 3 strings have
        # no ``decode`` and are printed unchanged.
        if hasattr(message, 'decode'):
            message = message.decode("string_escape")
        print(message)
Beispiel #11
0
    def test_investment_report_pdf(self):
        """Generate a report from minimal fixtures and check its first outline entry."""
        # TODO create example data for everything - after lunch
        market = Market.objects.create(name='test')
        sector = Sector.objects.create(name='test')

        sample_content = '# Lorem Ipsum\n\nHello'

        FrontPage.objects.create(sector=sector)
        SectorOverview.objects.create(sector=sector, content=sample_content)
        KillerFacts.objects.create(sector=sector, content=sample_content)
        MacroContextBetweenCountries.objects.create(market=market,
                                                    content=sample_content)
        UKMarketOverview.objects.create()
        SectorInitiatives.objects.create(sector=sector, content=sample_content)
        LastPage.objects.create(content=sample_content)

        pdf_io = investment_report_pdf_generator(market,
                                                 sector,
                                                 'Test',
                                                 moderated=False)
        reader = PdfFileReader(pdf_io)
        # Nothing else one can really do other than visual
        # inspection of the PDF
        # ``assertEquals`` is a deprecated alias that newer Python versions
        # no longer provide; ``assertEqual`` is the supported name.
        self.assertEqual(reader.getOutlines()[0]['/Title'], 'Contents')
def splitAccordingToBookmarks(PDF, basePDFName):
    """Split a PDF into consecutive parts, starting a new part at each bookmark.

    Parts are written to ``<basePDFName><n>.pdf`` with ``n`` counting from 0.
    The part collected so far is written whenever a bookmarked page is
    reached, and the remainder is written after the last page.

    Args:
        PDF: Path of the PDF file to split.
        basePDFName: Prefix of the generated file names.

    Returns:
        The number of top-level outline entries.
    """
    with open(PDF, 'rb') as source:
        reader = PdfFileReader(source)
        page_number_of = _setup_page_id_to_num(reader)
        outline = reader.getOutlines()
        # Page numbers at which a new part begins.
        split_points = {page_number_of[entry.page.idnum] for entry in outline}
        writer = PdfFileWriter()
        part = 0
        for page_no in range(reader.numPages):
            if page_no not in split_points:
                print('Just added ' + str(page_no) + ' to o/p')
                writer.addPage(reader.getPage(page_no))
                continue
            print('Found ' + str(page_no) + ' in s')
            # Flush what has been collected, then start the next part here.
            with open(str(basePDFName) + str(part) + ".pdf", "wb") as chunk:
                writer.write(chunk)
            part += 1
            writer = PdfFileWriter()
            writer.addPage(reader.getPage(page_no))
        with open(str(basePDFName) + str(part) + ".pdf", "wb") as chunk:
            writer.write(chunk)
        return len(outline)
srcfile = os.path.join(original_folder, filename)

# Make sure the source file exists before trying to open it.
try:
    filepath = os.stat(srcfile)
except OSError:
    # ``os.stat`` reports a missing or inaccessible file as OSError; a bare
    # ``except`` would also have swallowed KeyboardInterrupt and SystemExit.
    upload = False
    errormsg = "file does not exist"
    error_log(filename, upload, errormsg)
    sys.exit()

with open(srcfile, "rb") as f:
    pdf = PdfFileReader(f)
    # Reading the outline is expected to fail for files with child bookmarks.
    try:
        bookmarks = pdf.getOutlines()
    except Exception:
        upload = False
        errormsg = "this file contains bookmarks with child"
        error_log(filename, upload, errormsg)
        sys.exit()
    # Read the bookmarks.
    if bookmarks:
        for b in bookmarks:
            invID = b['/Title']
            # Raw string keeps ``\w`` a regex class, not an invalid escape.
            if len(invID) < 22 and re.match(r'\w', invID):
                i = pdf.getDestinationPageNumber(b)
                # Search InvID in database
                # Connect to db
                db = client.iportalDevDB19
                # Connect to collection
import PyPDF2
from PyPDF2 import PdfFileReader

# Print basic details and the top-level outline of example.pdf.
# The context manager closes the file once everything has been printed.
with open("example.pdf", 'rb') as pdf:
    reader = PdfFileReader(pdf)

    # Single-argument print calls produce the same output on Python 2 and 3.
    print("%s %s" % ("PDF Reader object is:", reader))
    print("%s %s" % ("Number of pages:", reader.getNumPages()))

    # Fetch the document information once instead of once per field.
    info = reader.getDocumentInfo()
    print("%s %s" % ("Title:          ", info.title))
    print("%s %s" % ("Author:         ", info.author))

    print("Book Outline")
    for heading in reader.getOutlines():
        # Nested lists hold child entries; only top-level ones are shown.
        if not isinstance(heading, list):
            print(dict(heading).get('/Title'))

Beispiel #15
0
    def __call__(self, value, system):
        """
        Implements a subclass of pyramid_oereb.core.renderer.extract.json_.Renderer to create a print result
        out of a json. The json extract is reformatted to fit the structure of mapfish print.

        When ``compute_toc_pages`` is enabled in the print configuration, the returned PDF is inspected
        and, if its table of contents length differs from the predicted one, the print is requested a
        second time with the corrected value.

        Args:
            value (tuple): A tuple containing the generated extract record and the params
                dictionary.
            system (dict): The available system properties.

        Returns:
            buffer: The pdf content as received from configured mapfish print instance url. If the
                ``getspec`` GET parameter is given and is not ``'no'``, the print specification is
                returned as a JSON string instead.

        Raises:
            HTTPBadRequest: If the request asks for images.
            HTTPInternalServerError: If the returned PDF cannot be read while checking the number of
                table of contents pages.
        """
        log.debug("Parameter webservice is {}".format(value[1]))

        if value[1].images:
            raise HTTPBadRequest('With image is not allowed in the print')

        self._request = self.get_request(system)

        # Create a lower case GET dict to be able to accept all cases of upper and lower case writing
        self._lowercase_GET_dict = dict(
            (k.lower(), v.lower()) for k, v in self._request.GET.items())

        # If a language is specified in the request, use it. Otherwise, use the language from base class
        self._fallback_language = Config.get('default_language')
        if 'lang' in self._lowercase_GET_dict:
            self._language = self._lowercase_GET_dict.get('lang')

        self._static_error_message = Config.get('static_error_message').get(self._language) or \
            Config.get('static_error_message').get(self._fallback_language)

        # Based on extract record and webservice parameter, render the extract data as JSON
        extract_record = value[0]
        extract_as_dict = self._render(extract_record, value[1])
        feature_geometry = mapping(extract_record.real_estate.limit)

        if Config.get('print', {}).get('compute_toc_pages', False):
            extract_as_dict['nbTocPages'] = TocPages(
                extract_as_dict).getNbPages()
        else:
            extract_as_dict['nbTocPages'] = 1

        self.convert_to_printable_extract(extract_as_dict, feature_geometry)

        print_config = Config.get('print', {})

        extract_as_dict[
            'Display_RealEstate_SubunitOfLandRegister'] = print_config.get(
                'display_real_estate_subunit_of_land_register', True)

        extract_as_dict['Display_Certification'] = print_config.get(
            'display_certification', False)

        # ``spec`` references ``extract_as_dict`` itself, so later changes to the dict are
        # part of any subsequent request built from ``spec``.
        spec = {
            'layout': Config.get('print', {})['template_name'],
            'outputFormat': 'pdf',
            'lang': self._language,
            'attributes': extract_as_dict,
        }

        response = self.get_response(system)

        if self._request.GET.get('getspec', 'no') != 'no':
            response.headers[
                'Content-Type'] = 'application/json; charset=UTF-8'
            return json.dumps(spec, sort_keys=True, indent=4)
        pdf_url = urlparse.urljoin(
            Config.get('print', {})['base_url'] + '/', 'buildreport.pdf')
        pdf_headers = Config.get('print', {})['headers']
        print_result = requests.post(pdf_url,
                                     headers=pdf_headers,
                                     data=json.dumps(spec))
        try:
            if Config.get('print', {}).get('compute_toc_pages', False):
                with io.BytesIO() as pdf:
                    pdf.write(print_result.content)
                    pdf_reader = PdfFileReader(pdf)
                    # Parse the outline once instead of on every loop iteration.
                    outlines = pdf_reader.getOutlines()
                    x = [
                        outline['/Page']['/StructParents']
                        for outline in outlines
                    ]
                    try:
                        true_nb_of_toc = min(x) - 1
                    except ValueError:
                        # No outline entries at all: min() of an empty list.
                        true_nb_of_toc = 1

                    if true_nb_of_toc != extract_as_dict['nbTocPages']:
                        log.warning(
                            'nbTocPages in result pdf: {} are not equal to the one predicted : {}, request new pdf'
                            .format(true_nb_of_toc,
                                    extract_as_dict['nbTocPages']))  # noqa
                        extract_as_dict['nbTocPages'] = true_nb_of_toc
                        print_result = requests.post(pdf_url,
                                                     headers=pdf_headers,
                                                     data=json.dumps(spec))
        except PdfReadError as e:
            err_msg = 'a problem occurred while generating the pdf file'
            log.error(err_msg + ': ' + str(e))
            raise HTTPInternalServerError(self._static_error_message)

        # Plain attribute access on the HTTP response; it cannot raise PdfReadError, so the
        # former handler around it was unreachable.
        content = print_result.content

        # Save printed file to the specified path.
        pdf_archive_path = print_config.get('pdf_archive_path', None)
        if pdf_archive_path is not None:
            self.archive_pdf_file(pdf_archive_path, content, extract_as_dict)

        response.status_code = print_result.status_code
        response.headers = print_result.headers
        if 'Transfer-Encoding' in response.headers:
            del response.headers['Transfer-Encoding']
        if 'Connection' in response.headers:
            del response.headers['Connection']
        return content
Beispiel #16
0
def test_get_outlines(src, outline_elements):
    """Check that the PDF at ``src`` has ``outline_elements`` top-level outline items."""
    top_level_count = len(PdfFileReader(src).getOutlines())
    assert top_level_count == outline_elements
class PdfBookmark(object):
    """
    This class supports import/export PDF's
    bookmarks from/to a file.
    """
    def __init__(self, pdfPathName):
        """
        Open the pdf and load its outline, annotated with page ratios.
        The pdf stream stays open for the lifetime of the instance because
        importBookmark reads pages from the reader later on.
        """
        self.pdfFileName = pdfPathName
        self._pdfStream = open(self.pdfFileName, 'rb')
        self._pdfReader = PdfFileReader(self._pdfStream)

        self.pageLabels = self._getPageLabels()
        self.outlines = self._pdfReader.getOutlines()
        self._addPageRatio(self.outlines, self.pageLabels)

    def getBookmark(self):
        """
        Retrieve this pdf's bookmark.
        """
        return self.outlines

    def exportBookmark(self, bookmarkFile):
        """
        Export bookmarks to a file.
        """
        # Closing the stream guarantees the bookmarks are flushed to disk.
        with codecs.open(bookmarkFile, 'w', encoding='utf8') as stream:
            _writeBookmarkToStream(self.outlines, stream, 0)
        print("Export %s's bookmarks to %s finished!" %
              (self.pdfFileName, bookmarkFile))

    def importBookmark(self, bookmarkFile, saveAsPdfName=None):
        """
        Import the contents from a bookmark file and add these bookmarks
        to the current pdf file or another pdf file.
        If saveAsPdfName is omitted, the result is saved next to the source
        pdf with its last four characters replaced by '_bookmark.pdf'.
        """
        outlines = readBookmarkFromFile(bookmarkFile)
        output = PdfFileWriter()
        for i in range(0, self._pdfReader.getNumPages()):
            output.addPage(self._pdfReader.getPage(i))
        _writeOutlinesToPdf(outlines, output, None)

        if saveAsPdfName is None:
            saveAsPdfName = self.pdfFileName[0:-4] + '_bookmark.pdf'
        # The context manager closes the output even if writing fails.
        with open(saveAsPdfName, 'wb') as stream:
            output.write(stream)
        print("Add bookmarks in %s to %s finished!" %
              (bookmarkFile, saveAsPdfName))

    def _getPageLabels(self):
        """
        Get the map from IndirectObject id to real page number.
        """
        pageLabels = {}
        # Real page numbers are 1-based.
        for pageNum, page in enumerate(self._pdfReader.pages, 1):
            pageLabels[page.indirectRef.idnum] = pageNum
        return pageLabels

    def _addPageRatio(self, outlines, pageLabels):
        """
        Retrieves page ratio from Destination list.
        Every non-list entry is replaced in place by a plain dict; when the
        ratio can be computed it is stored under the '/Ratio' key.
        The process exits with status -1 if an entry has no '/Page' key or
        its page id is not in pageLabels.
        param outlines: Destination list
        param pageLabels: map from IndirectObject id to real page number
        """
        for i, outline in enumerate(outlines):
            if isinstance(outline, list):
                # Nested lists hold child bookmarks; handle them recursively.
                self._addPageRatio(outline, pageLabels)
                continue
            # ``in`` replaces ``has_key``, which Python 3 dicts do not have.
            if '/Page' not in outline:
                print("Error: outline has no key '/Page'")
                sys.exit(-1)
            pageHeight = outline['/Page']['/MediaBox'][-1]
            idIndirect = outline.page.idnum
            if idIndirect in pageLabels:
                pageNum = pageLabels[idIndirect]
            else:
                print(
                    'Error: Page corresponds to IndirectObject %d not Found' %
                    idIndirect)
                sys.exit(-1)
            # Subscript access is kept on purpose: ``dict.get`` would bypass
            # any ``__getitem__`` override of the outline object.
            if '/Top' in outline:
                top = outline['/Top']
            else:
                top = pageHeight
            if '/Zoom' in outline:
                zoom = outline['/Zoom']
            else:
                zoom = 1
            outline = dict(outline)
            try:
                outline['/Ratio'] = pageNum + (1 - top / zoom / pageHeight)
            except Exception:
                # Best effort: entries whose values cannot be combined
                # arithmetically simply get no '/Ratio'. Unlike a bare
                # except, this lets KeyboardInterrupt/SystemExit through.
                pass
            outlines[i] = outline
Beispiel #18
0
    def __call__(self, value, system):
        """
        Implements a subclass of pyramid_oereb.lib.renderer.extract.json_.Renderer to create a print result
        out of a json. The json extract is reformatted to fit the structure of mapfish print.

        When ``compute_toc_pages`` is enabled in the print configuration, the returned PDF is inspected
        and, if its table of contents length differs from the predicted one, the print is requested a
        second time with the corrected value. For a non-reduced extract that printed successfully, the
        PDF documents collected in ``pdf_to_join`` are downloaded and appended with ``pdftk``.

        Args:
            value (tuple): A tuple containing the generated extract record and the params
                dictionary.
            system (dict): The available system properties.

        Returns:
            buffer: The pdf content as received from configured mapfish print instance url. If the
                ``getspec`` GET parameter is given and is not ``'no'``, the print specification is
                returned as a JSON string instead.

        Raises:
            HTTPBadRequest: If the request asks for images.
            subprocess.CalledProcessError: If the ``pdftk`` call exits with a non-zero status.
        """
        log.debug("Parameter webservice is {}".format(value[1]))

        if value[1].images:
            raise HTTPBadRequest('With image is not allowed in the print')

        self._request = self.get_request(system)

        # Create a lower case GET dict to be able to accept all cases of upper and lower case writing
        # ``items()`` is available on both Python 2 and Python 3; ``iteritems()`` is Python 2 only.
        self._lowercase_GET_dict = dict(
            (k.lower(), v.lower()) for k, v in self._request.GET.items())

        # If a language is specified in the request, use it. Otherwise, use the language from base class
        self._fallback_language = Config.get('default_language')
        if 'lang' in self._lowercase_GET_dict:
            self._language = self._lowercase_GET_dict.get('lang')

        # Based on extract record and webservice parameter, render the extract data as JSON
        extract_record = value[0]
        extract_as_dict = self._render(extract_record, value[1])
        feature_geometry = mapping(extract_record.real_estate.limit)
        pdf_to_join = set()

        if Config.get('print', {}).get('compute_toc_pages', False):
            extract_as_dict['nbTocPages'] = TocPages(
                extract_as_dict).getNbPages()
        else:
            extract_as_dict['nbTocPages'] = 1

        self.convert_to_printable_extract(extract_as_dict, feature_geometry,
                                          pdf_to_join)

        print_config = Config.get('print', {})

        extract_as_dict[
            'Display_RealEstate_SubunitOfLandRegister'] = print_config.get(
                'display_real_estate_subunit_of_land_register', True)

        extract_as_dict['Display_Certification'] = print_config.get(
            'display_certification', True)

        # ``spec`` references ``extract_as_dict`` itself, so later changes to the dict are
        # part of any subsequent request built from ``spec``.
        spec = {
            'layout': Config.get('print', {})['template_name'],
            'outputFormat': 'pdf',
            'lang': self._language,
            'attributes': extract_as_dict,
        }

        response = self.get_response(system)

        if self._request.GET.get('getspec', 'no') != 'no':
            response.headers[
                'Content-Type'] = 'application/json; charset=UTF-8'
            return json.dumps(spec, sort_keys=True, indent=4)
        pdf_url = urlparse.urljoin(
            Config.get('print', {})['base_url'] + '/', 'buildreport.pdf')
        pdf_headers = Config.get('print', {})['headers']
        print_result = requests.post(pdf_url,
                                     headers=pdf_headers,
                                     data=json.dumps(spec))
        if Config.get('print', {}).get('compute_toc_pages', False):
            with io.BytesIO() as pdf:
                pdf.write(print_result.content)
                pdf_reader = PdfFileReader(pdf)
                # Parse the outline once instead of on every loop iteration.
                outlines = pdf_reader.getOutlines()
                x = [
                    outline['/Page']['/StructParents'] for outline in outlines
                ]
                try:
                    true_nb_of_toc = min(x) - 1
                except ValueError:
                    # No outline entries at all: min() of an empty list.
                    true_nb_of_toc = 1

                if true_nb_of_toc != extract_as_dict['nbTocPages']:
                    log.warning(
                        'nbTocPages in result pdf: {} are not equal to the one predicted : {}, request new pdf'
                        .format(true_nb_of_toc,
                                extract_as_dict['nbTocPages']))  # noqa
                    extract_as_dict['nbTocPages'] = true_nb_of_toc
                    print_result = requests.post(pdf_url,
                                                 headers=pdf_headers,
                                                 data=json.dumps(spec))

        if not extract_as_dict['isReduced'] and print_result.status_code == 200:
            # Every temporary file is tracked so it can be closed (and thereby deleted)
            # once the joined PDF has been read, even if a step in between fails.
            temp_files = []
            try:
                main = tempfile.NamedTemporaryFile(suffix='.pdf')
                temp_files.append(main)
                main.write(print_result.content)
                main.flush()
                cmd = ['pdftk', main.name]
                for url in pdf_to_join:
                    result = requests.get(url)
                    content_type = result.headers.get('content-type')
                    # Formatting keeps the log call safe when the header is missing (None).
                    log.debug("document url: {} => content_type: {}".format(
                        url, content_type))
                    if content_type != 'application/pdf':
                        msg = "Skipped document inclusion (url: '{}') because content_type: '{}'"
                        log.warning(msg.format(url, content_type))
                        continue
                    tmp_file = tempfile.NamedTemporaryFile(suffix='.pdf')
                    temp_files.append(tmp_file)
                    tmp_file.write(result.content)
                    tmp_file.flush()
                    cmd.append(tmp_file.name)
                out = tempfile.NamedTemporaryFile(suffix='.pdf')
                temp_files.append(out)
                cmd += ['cat', 'output', out.name]
                # NOTE(review): the flush and short sleep are kept from the original; their
                # purpose is not visible from this block.
                sys.stdout.flush()
                time.sleep(0.1)
                subprocess.check_call(cmd)
                content = out.file.read()
            finally:
                for temp_file in temp_files:
                    temp_file.close()
        else:
            content = print_result.content

        # Save printed file to the specified path.
        pdf_archive_path = print_config.get('pdf_archive_path', None)
        if pdf_archive_path is not None:
            self.archive_pdf_file(pdf_archive_path, content, extract_as_dict)

        response.status_code = print_result.status_code
        response.headers = print_result.headers
        if 'Transfer-Encoding' in response.headers:
            del response.headers['Transfer-Encoding']
        if 'Connection' in response.headers:
            del response.headers['Connection']
        return content
Beispiel #19
0
        result = []

    if type(outlines) == list:
        for outline in outlines:
            result = bookmarks(outline, pg_id_num_map, result)

    elif type(outlines) == PyPDF2.pdf.Destination:
        result.append((pg_id_num_map[outlines.page.idnum]+1, outlines['/Title']))

    return result
    

# Read the outline while the file is open, then release the handle.
with open(PDF, 'rb') as pdf_file:
    PDF_IN = PdfFileReader(pdf_file)
    pg_id_num_map = page_id_to_num(PDF_IN)

    outlines = PDF_IN.getOutlines()
    # Keep only top-level entries whose title is not excluded.
    outlines = [item for item in outlines if not isinstance(item, list)]
    outlines = [item for item in outlines if item['/Title'] not in exclude]

    bmrks = bookmarks(outlines, pg_id_num_map)

# Each entry spans from its own page to the page before the next bookmark.
# zip stops before the last bookmark, which has no successor; this replaces
# the former bare ``except`` around ``next()``.
TOC = []
for x, following in zip(bmrks, bmrks[1:]):
    TOC.append((x[0], following[0] - 1, x[1]))

# Single-argument call form prints identically on Python 2 and 3.
print(TOC)
Beispiel #20
0
from PyPDF2 import PdfFileReader, PdfFileWriter
import translate

readFile = 'pdf/wtf_trans.pdf'

# Print the outline tree; the context manager closes the file afterwards.
with open(readFile, "rb") as pdf_file:
    pdf = PdfFileReader(pdf_file)

    print(pdf.getOutlines())


def merge_pdf(firstpdf, secondpdf, insertpage, output='merge_pdf.pdf'):
    """Insert ``secondpdf`` into ``firstpdf`` and write the merged document.

    Args:
        firstpdf: The PDF that forms the base of the result.
        secondpdf: The PDF to insert.
        insertpage: Position passed to ``PdfFileMerger.merge`` at which
            ``secondpdf`` is inserted.
        output: Destination of the merged PDF. Defaults to the file name the
            function always used before.
    """
    # Create the merger used to combine the documents.
    pdf_merger = PdfFileMerger()
    try:
        pdf_merger.append(firstpdf)
        pdf_merger.merge(insertpage, secondpdf)
        # A bookmark could be added here with ``pdf_merger.addBookmark``.
        pdf_merger.write(output)
    finally:
        # Release the file handles held by the merger, even on failure.
        pdf_merger.close()

#
# def split_by_num(filename, nums, password=None):
# Split a book into one PDF per chapter (fragment; the loop body below stops
# after computing the page range of each chapter).
filename = r'F:\研一下\量化投资资料\量化教材\Hands-On_Machine_Learning_for_Algorithmic_Trading.pdf'
# NOTE(review): the file handle opened here is never closed.
pdf_reader = PdfFileReader(open(filename, mode='rb' ))
pages = pdf_reader.getNumPages()
outline = pdf_reader.getOutlines()
# Titles of the outline entries that contain the word 'Chapter'.
outlinchapter = []
# Hard-coded page numbers, each shifted by 18.
# NOTE(review): presumably the shift accounts for front matter - confirm.
outlinepage = [i+18 for i in [8,33,65,88,119,147,175,224,260,284,312,351,389,418,441,458]]
for o in outline:
    # The title is pulled out of the entry's string representation.
    # NOTE(review): res[0] raises IndexError when the pattern does not match,
    # e.g. for nested lists of child entries.
    res = re.findall(r"'/Title': '(.*?)', '/Page': IndirectObject\((.*?), 0\)",str(o),re.S)
    if 'Chapter' in res[0][0]:
        outlinchapter.append(res[0][0])
#print(list(outlinedict[0].keys())[0],list(outlinedict[0].values())[0])
# One single-item dict per chapter: {title: page}.
outlinedict =[{i[0]:i[1]} for i in zip(outlinchapter,outlinepage)]


# NOTE(review): i runs up to len(outlinedict), so outlinedict[i + 1] (and on
# the last pass outlinedict[i]) indexes past the end of the list - confirm the
# intended bounds and how the last chapter's end page should be derived.
for i in range(len(outlinedict)+1):
    pdf_writer = PdfFileWriter()
    # Colons are removed from the title to form the output file name.
    split_pdf_name = list(outlinedict[i].keys())[0].replace(':','') + '.pdf'
    start = list(outlinedict[i].values())[0]
    end = list(outlinedict[i+1].values())[0]
Beispiel #22
0
            result.update(bookmark_dict(item))
        else:
            result[reader.getDestinationPageNumber(item)] = item.title
    return result



# Ask for the directory and the file name; the two are joined as typed.
print("Enter path to File(Example:: C:/Bob/Documents/)\n make sure it ends with /:", end='')
pa = input()
print("Enter the PDF file name(Example:: Bob.pdf):", end='')
th = input()
path = "".join((pa, th))
writer = PdfFileWriter()
reader = PdfFileReader(path)

BookMarks = bookmark_dict(reader.getOutlines())
Total_Number_pages = reader.getNumPages()

# Clean every bookmark title: drop the "b'" marker and the trailing "\r'"
# of a printed byte string, and spell out ampersands.
Bname = ""
for i in BookMarks:
    Bname = str(BookMarks[i]).replace("b'", "").replace(r"\r'", "").replace("&", "AND")
    BookMarks[i] = Bname

# State used further down the script.
j = 0
ListOfList = []
Total_Number_Pages = reader.getNumPages()
chapters = []


def flatten(A):
    """Return a flat list of the items of ``A``, expanding nested lists.

    Only ``list`` instances are expanded, to any depth; every other item,
    tuples included, is kept as is.
    """
    flat = []
    for element in A:
        flat += flatten(element) if isinstance(element, list) else [element]
    return flat


# NOTE(review): the stream is left open on purpose; code beyond this fragment
# may still read from ``pdf_content`` - confirm and close it when done.
file_stream = open(file_to_read, 'rb')
pdf_content = PdfFileReader(file_stream)
outlines = pdf_content.getOutlines()

# Longest title kept for a chapter name.
max_number_of_characters = 100

# A chapter is a Destination that is directly followed by the list of its
# child entries.
for i, item in enumerate(outlines):
    # Guard the look-ahead: the last entry has no successor, and indexing
    # past the end raised IndexError whenever it was a Destination.
    has_children = i + 1 < len(outlines) and type(outlines[i + 1]) is list
    if type(item) is generic.Destination and has_children:
        title = item.title
        title = '_'.join(title.strip().replace('/', '_').split(' '))
        if len(title) > max_number_of_characters:
            title = title[:max_number_of_characters]
        # The chapter entry itself becomes the first item of its content.
        outlines[i + 1].insert(0, item)
        content = outlines[i + 1]
        chapters.append((title, content))

for chapter in chapters:

    subchapters = flatten(chapter[1])
Beispiel #24
0
    readFile)  # 或者这个方式:pdfFileReader = PdfFileReader(open(readFile, 'rb'))
# Document information of the PDF file.
documentInfo = pdfFileReader.getDocumentInfo()
print('documentInfo = %s' % documentInfo)

# Page layout.
pageLayout = pdfFileReader.getPageLayout()
print('pageLayout = %s ' % pageLayout)

# Page mode.
pageMode = pdfFileReader.getPageMode()
print('pageMode = %s' % pageMode)

# XMP metadata.
xmpMetadata = pdfFileReader.getXmpMetadata()
print('xmpMetadata  = %s ' % xmpMetadata)

# Document outline.
outLines = pdfFileReader.getOutlines()
print('outLine = %s' % outLines)

# Number of pages in the PDF file.
pageCount = pdfFileReader.getNumPages()
print('pageCount = %s' % pageCount)

for index in range(pageCount):
    # Page object stored at this index.
    pageObj = pdfFileReader.getPage(index)
    # The printed type is <class 'PyPDF2.pdf.PageObject'>.
    print('index = %d , pageObj = %s' % (index, type(pageObj)))
    # Page number the reader reports for that page object.
    pageNumber = pdfFileReader.getPageNumber(pageObj)
    print('pageNumber = %s ' % pageNumber)