def addAttachment(self, name, data, subtype=None):
    """
    Add an attachment to the pdf. Supports adding multiple attachment, while respecting PDF/A rules.

    :param name: The name of the attachement
    :param data: The data of the attachement
    :param subtype: The mime-type of the attachement. This is required by PDF/A, but not essential
        otherwise. It should take the form of "/xxx#2Fxxx". E.g. for "text/xml": "/text#2Fxml"
    """
    pdf_subtype = subtype
    if subtype:
        # A raw mimetype (e.g. "text/xml") is first rewritten into the pdf-valid
        # name form ("/text#2Fxml") before being validated.
        if REGEX_SUBTYPE_UNFORMATED.match(subtype):
            pdf_subtype = '/' + subtype.replace('/', '#2F')
        if not REGEX_SUBTYPE_FORMATED.match(pdf_subtype):
            # Still malformed after conversion: drop the subtype rather than
            # write an invalid value into the document.
            _logger.warning(
                "Attempt to add an attachment with the incorrect subtype '%s'. The subtype will be ignored.",
                subtype)
            pdf_subtype = ''

    attachment = self._create_attachment_object({
        'filename': name,
        'content': data,
        'subtype': pdf_subtype,
    })

    root = self._root_object
    if root.get('/Names') and root['/Names'].get('/EmbeddedFiles'):
        # A name tree already exists: append to its /Names array.
        file_entries = root["/Names"]["/EmbeddedFiles"]["/Names"]
        file_entries.extend([attachment.getObject()['/F'], attachment])
    else:
        # No /Names -> /EmbeddedFiles tree yet: build it and hook it into the catalogue.
        file_entries = ArrayObject()
        file_entries.extend([attachment.getObject()['/F'], attachment])
        embedded_names = DictionaryObject()
        embedded_names.update({NameObject("/Names"): file_entries})
        embedded_files = DictionaryObject()
        embedded_files.update({NameObject("/EmbeddedFiles"): embedded_names})
        root.update({NameObject("/Names"): embedded_files})

    if root.get('/AF'):
        # The associated-files array exists: just reference the new attachment.
        root['/AF'].extend([attachment])
    else:
        # Create a new object containing an array referencing embedded file
        # And reference this array in the root catalogue
        associated_files = self._addObject(ArrayObject([attachment]))
        root.update({NameObject("/AF"): associated_files})
def addAttachment(self, name, data, subtype=""):
    """
    Add an attachment to the pdf. Supports adding multiple attachment, while respecting PDF/A rules.

    :param name: The name of the attachement
    :param data: The data of the attachement
    :param subtype: The mime-type of the attachement. This is required by PDF/A, but not essential
        otherwise. It should take the form of "/xxx#2Fxxx". E.g. for "text/xml": "/text#2Fxml".
        A plain mimetype (e.g. "text/xml") is also accepted and converted automatically.
    """
    # Generalized from a hard-coded 'application/xml' special case: any raw
    # mimetype ("xxx/yyy") is converted to the pdf-valid name form
    # ("/xxx#2Fyyy"). Values already starting with '/' pass through unchanged.
    if subtype and not subtype.startswith('/'):
        subtype = '/' + subtype.replace('/', '#2F')
    attachment = self._create_attachment_object({
        'filename': name,
        'content': data,
        'subtype': subtype,
    })
    if self._root_object.get('/Names') and self._root_object['/Names'].get('/EmbeddedFiles'):
        # A /Names -> /EmbeddedFiles tree already exists: append to it.
        names_array = self._root_object["/Names"]["/EmbeddedFiles"]["/Names"]
        names_array.extend([attachment.getObject()['/F'], attachment])
    else:
        # Build the embedded-files name tree and hook it into the catalogue.
        names_array = ArrayObject()
        names_array.extend([attachment.getObject()['/F'], attachment])
        embedded_files_names_dictionary = DictionaryObject()
        embedded_files_names_dictionary.update({NameObject("/Names"): names_array})
        embedded_files_dictionary = DictionaryObject()
        embedded_files_dictionary.update({NameObject("/EmbeddedFiles"): embedded_files_names_dictionary})
        self._root_object.update({NameObject("/Names"): embedded_files_dictionary})
    if self._root_object.get('/AF'):
        attachment_array = self._root_object['/AF']
        attachment_array.extend([attachment])
    else:
        # Create a new object containing an array referencing embedded file
        # And reference this array in the root catalogue
        attachment_array = self._addObject(ArrayObject([attachment]))
        self._root_object.update({NameObject("/AF"): attachment_array})
def main():
    """Assemble the final PDF offline, from previously saved JSON files.

    Reads bookinfo.json / pageinfo.json / pages.json (and optionally
    bookmarks.json) from the current directory, decodes the base64 page data,
    merges the pages, adds bookmarks and page labels, and writes
    "<title>.pdf".
    """
    print("Loading metadata and eText information...")
    with open("bookinfo.json", 'r') as bookInfoRequest:
        str_response = bookInfoRequest.read()
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]
    with open("pageinfo.json", 'r') as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read())
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']
    with open("pages.json", 'r') as file:
        downloadedData = json.loads(file.read())[0]["pdfPlayerPageInfoTOList"]

    def get_data(page_id):
        # Each page's data is a "data:application/pdf;base64," URI: strip the
        # prefix and decode the remainder into raw PDF bytes.
        b = next((x['data'] for x in downloadedData if x['pageID'] == page_id), None)
        return bytearray(base64.standard_b64decode(b[len("data:application/pdf;base64,"):]))

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}  # maps bookPageNumber (label) -> pageOrder (pdf index)
        # urllib.request.urlretrieve(getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"), os.path.join(pdfDownloadDir, "0000 - cover.pdf"))
        with open(os.path.join(pdfDownloadDir, "0000 - cover.pdf"), 'w+b') as ous:
            ous.write(get_data(pageInfo[0]['pageID']))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
            with open(savePath, 'w+b') as out:
                out.write(get_data(pdfPage['pageID']))
            # urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        threadPool = ThreadPool(40)  # 40 threads should download a book fairly quickly
        print("Reading pages from pageinfo.json to \"{}\"...".format(pdfDownloadDir))
        threadPool.map(download, pageInfo)

        print("Assembling PDF...")
        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            page = PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0)
            os.remove(os.path.join(pdfDownloadDir, pdfFile))  # Save on memory a bit
            fileMerger.addPage(page)

        bookmarksExist = True  # TODO: Bookmarks currently not supported
        with open("bookmarks.json", 'r') as bookmarkInfoRequest:
            try:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read())
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception:
                # Deliberate best-effort: any parse/shape failure means the
                # book simply has no usable bookmark data.
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['name']  # Name of the section
                pageNum = str(bookmark['linkvalue']['content'])  # First page (in the pdf's format)
                latestBookmark = fileMerger.addBookmark(bookmarkName, pdfPageTable[pageNum], parent)
                if 'basketentry' in bookmark:
                    recursiveSetBookmarks(bookmark['basketentry'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['basketcollection']['basket']['basketentry'])
        else:
            print("Bookmarks don't exist for book")

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        pdf_page_label_table = [(v, k) for k, v in pdfPageTable.items()]
        pdf_page_label_table = sorted(pdf_page_label_table, key=(lambda x: int(x[0])))
        # BUG FIX: /PageLabels /Nums keys must be numbers. The original used
        # NameObject(0) for the cover entry, which serializes as a bare "0"
        # name instead of the numeric page index 0.
        labels = ArrayObject([
            NumberObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        last_mode = None
        last_prefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdf_page_label_table:
            curr_mode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                curr_mode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                curr_mode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Only emit a new label range when the numbering style or prefix changes.
            if curr_mode != last_mode or prefix != last_prefix:
                if prefix:
                    style.update({NameObject("/P"): NameObject("({})".format(prefix))})
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                last_mode = curr_mode
                last_prefix = prefix

        root_obj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        # fileMerger._addObject(pageLabels)
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        root_obj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        with open("{}.pdf".format(bookInfo['title']).replace("/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def main(bookId):
    """Download and assemble an eText as a PDF, given a book id or a player URL.

    :param bookId: either a numeric book id, or an eText player URL from which
        the book id is extracted.
    Writes "<bookId> - <title>.pdf" in the current directory.
    """
    if bookId.startswith("http"):
        print("Trying to extract bookId from url")
        bookData = urllib.parse.parse_qs(bookId.split("?")[-1])
        if (bookData.get("values", None)) is not None:
            # "values" packs key/value pairs as a single "::"-separated list;
            # pair them up into a parse_qs-like dict of single-item lists.
            bookData = {
                itemName: [itemValue]
                for itemName, itemValue in zip(
                    *[iter(bookData["values"][0].split("::"))] * 2)
            }
            # Fix capitalization
            bookData["bookid"] = bookData["bookID"]
        bookId = bookData["bookid"][0]
    bookId = int(bookId)
    print(
        "Downloading book id {}. Please open an issue on GitHub if this book id is incorrect."
        .format(bookId))

    print("Downloading metadata and eText information...")
    bookInfoGetUrl = bookInfoUrl.format(bookId)
    #print(hsidUrl(bookInfoGetUrl))
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]
    pageInfoGetUrl = pageInfoUrl.format(
        userroleid=roletypeid,
        bookid=bookId,
        bookeditionid=bookInfo['bookEditionID'])
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        # Server paths are given relative to "/assets/"; the url template
        # expects them without that prefix.
        pdfPage = pdfPage.replace("/assets/", "")
        getPage = pdfUrl.format(bookid=bookInfo['globalBookID'],
                                pdfpage=pdfPage,
                                iscover=isCover)
        return hsidUrl(getPage)

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}  # maps bookPageNumber (label) -> pageOrder (pdf index)
        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        threadPool = ThreadPool(40)  # 40 threads should download a book fairly quickly
        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        threadPool.map(download, pageInfo)

        print("Assembling PDF...")
        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=roletypeid,
            bookid=bookId,
            language=language,
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=1001)
        bookmarksExist = True
        with urllib.request.urlopen(hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            try:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read().decode('utf-8'))
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception:
                # Deliberate best-effort: any parse/shape failure means the
                # book simply has no usable bookmark data.
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['n']  # Name of the section
                pageNum = str(bookmark['lv']['content'])  # First page (in the pdf's format)
                latestBookmark = fileMerger.addBookmark(
                    bookmarkName, pdfPageTable[pageNum], parent)
                if 'be' in bookmark:
                    recursiveSetBookmarks(bookmark['be'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['bc']['b']['be'])
        else:
            print("Bookmarks don't exist for ID {}".format(bookId))

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        pdfPageLabelTable = [(v, k) for k, v in pdfPageTable.items()]
        pdfPageLabelTable = sorted(pdfPageLabelTable, key=(lambda x: int(x[0])))
        # BUG FIX: /PageLabels /Nums keys must be numbers. The original used
        # NameObject(0) for the cover entry, which serializes as a bare "0"
        # name instead of the numeric page index 0.
        labels = ArrayObject([
            NumberObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                currMode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                currMode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Only emit a new label range when the numbering style or prefix changes.
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update(
                        {NameObject("/P"): NameObject("({})".format(prefix))})
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix

        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        #fileMerger._addObject(pageLabels)
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        rootObj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        with open(
                "{} - {}.pdf".format(bookId, bookInfo['title']).replace(
                    "/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def main(eTextUrl):
    """Download and assemble an eText as a PDF, given a full eText player URL.

    The URL's query string supplies bookid / userid / sessionid (possibly
    packed into a single "values" parameter). Writes "out.pdf" in the current
    directory.
    """
    bookData = urllib.parse.parse_qs(eTextUrl.split("?")[-1])
    if (bookData.get("values", None)) is not None:
        # "values" packs key/value pairs as a single "::"-separated list;
        # pair them up into a parse_qs-like dict of single-item lists.
        bookData = {
            itemName: [itemValue]
            for itemName, itemValue in zip(
                *[iter(bookData["values"][0].split("::"))] * 2)
        }
        # A few fixes in terms of capitalization
        bookData["bookid"] = bookData["bookID"]
        bookData["userid"] = bookData["userID"]
        bookData["sessionid"] = bookData["sessionID"]
    # We'll default to the roletypeid for a student
    bookData["roletypeid"] = [roletypeid]  # 3 for Instructor... the server doesn't care, though

    print("Downloading metadata and eText information...")
    bookInfoGetUrl = bookInfoUrl.format(bookData["bookid"][0])
    #print(hsidUrl(bookInfoGetUrl))
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]
    pageInfoGetUrl = pageInfoUrl.format(
        userid=bookData['userid'][0],
        userroleid=bookData['roletypeid'][0],
        bookid=bookData['bookid'][0],
        bookeditionid=bookInfo['bookEditionID'],
        authkey=bookData['sessionid'][0],
    )
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        # Server paths are given relative to "/assets/"; the url template
        # expects them without that prefix.
        pdfPage = pdfPage.replace("/assets/", "")
        getPage = pdfUrl.format(
            bookid=bookInfo['globalBookID'],
            pdfpage=pdfPage,
            iscover=isCover,
            authkey=bookData['sessionid'][0]
        )
        return hsidUrl(getPage)

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}  # maps bookPageNumber (label) -> pageOrder (pdf index)
        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        threadPool = ThreadPool(40)  # 40 threads should download a book fairly quickly
        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        threadPool.map(download, pageInfo)

        print("Assembling PDF...")
        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=bookData['roletypeid'][0],
            bookid=bookData['bookid'][0],
            language=language,
            authkey=bookData['sessionid'][0],
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=bookData['scenario'][0],
        )
        with urllib.request.urlopen(hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            # NOTE(review): bookmarkInfo is fetched but not used beyond the
            # cover bookmark below — full bookmark support looks unfinished here.
            bookmarkInfo = json.loads(bookmarkInfoRequest.read().decode('utf-8'))
            bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
        fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        pdfPageLabelTable = [(v, k) for k, v in pdfPageTable.items()]
        pdfPageLabelTable = sorted(pdfPageLabelTable, key=(lambda x: int(x[0])))
        # BUG FIX: /PageLabels /Nums keys must be numbers. The original used
        # NameObject(0) for the cover entry, which serializes as a bare "0"
        # name instead of the numeric page index 0.
        labels = ArrayObject([
            NumberObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                currMode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                currMode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Only emit a new label range when the numbering style or prefix changes.
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix

        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        #fileMerger._addObject(pageLabels)
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        rootObj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        with open("out.pdf", "wb") as outFile:
            fileMerger.write(outFile)