Beispiel #1
0
    def addAttachment(self, name, data, subtype=None):
        """
        Add an attachment to the pdf. Supports adding multiple attachments, while respecting PDF/A rules.

        The attachment is appended to the catalogue's /Names -> /EmbeddedFiles -> /Names array and to the
        catalogue's /AF array. Whichever of these structures is missing is created; an existing /Names
        dictionary is extended in place, so entries it already holds are kept.

        :param name: The name of the attachment
        :param data: The data of the attachment
        :param subtype: The mime-type of the attachment. This is required by PDF/A, but not essential otherwise.
        It should take the form of "/xxx#2Fxxx". E.g. for "text/xml": "/text#2Fxml"
        A value matching REGEX_SUBTYPE_UNFORMATED is converted to that form. A value that does not match
        REGEX_SUBTYPE_FORMATED after that step is logged and replaced by an empty string.
        """
        adapted_subtype = subtype
        if subtype:
            # If we receive the subtype in an 'unformatted' (mimetype) format, we'll try to convert it to a pdf-valid one
            if REGEX_SUBTYPE_UNFORMATED.match(subtype):
                adapted_subtype = '/' + subtype.replace('/', '#2F')

            if not REGEX_SUBTYPE_FORMATED.match(adapted_subtype):
                # The subtype still does not match the correct format, so we will not add it to the document
                _logger.warning(
                    "Attempt to add an attachment with the incorrect subtype '%s'. The subtype will be ignored.",
                    subtype)
                adapted_subtype = ''

        attachment = self._create_attachment_object({
            'filename': name,
            'content': data,
            'subtype': adapted_subtype,
        })
        # The names array receives the attachment's /F entry followed by the attachment itself
        names_entry = [attachment.getObject()['/F'], attachment]

        if self._root_object.get('/Names'):
            names_dictionary = self._root_object['/Names']
            if names_dictionary.get('/EmbeddedFiles'):
                names_dictionary['/EmbeddedFiles']['/Names'].extend(names_entry)
            else:
                # Add /EmbeddedFiles next to whatever /Names already holds instead of
                # replacing the whole /Names dictionary, which would drop those entries
                names_dictionary.update({
                    NameObject("/EmbeddedFiles"):
                    DictionaryObject({NameObject("/Names"): ArrayObject(names_entry)})
                })
        else:
            embedded_files_names_dictionary = DictionaryObject(
                {NameObject("/Names"): ArrayObject(names_entry)})
            embedded_files_dictionary = DictionaryObject(
                {NameObject("/EmbeddedFiles"): embedded_files_names_dictionary})
            self._root_object.update(
                {NameObject("/Names"): embedded_files_dictionary})

        if self._root_object.get('/AF'):
            attachment_array = self._root_object['/AF']
            attachment_array.extend([attachment])
        else:
            # Create a new object containing an array referencing embedded file
            # And reference this array in the root catalogue
            attachment_array = self._addObject(ArrayObject([attachment]))
            self._root_object.update({NameObject("/AF"): attachment_array})
Beispiel #2
0
    def addAttachment(self, name, data, subtype=""):
        """
        Add an attachment to the pdf. Supports adding multiple attachments, while respecting PDF/A rules.

        The attachment is appended to the catalogue's /Names -> /EmbeddedFiles -> /Names array and to the
        catalogue's /AF array. Whichever of these structures is missing is created; an existing /Names
        dictionary is extended in place, so entries it already holds are kept.

        :param name: The name of the attachment
        :param data: The data of the attachment
        :param subtype: The mime-type of the attachment. This is required by PDF/A, but not essential otherwise.
        It should take the form of "/xxx#2Fxxx". E.g. for "text/xml": "/text#2Fxml"
        Only the plain value "application/xml" is converted automatically (to "/application#2Fxml");
        any other value is passed on unchanged.
        """
        if subtype == 'application/xml':
            subtype = '/application#2Fxml'

        attachment = self._create_attachment_object({
            'filename': name,
            'content': data,
            'subtype': subtype,
        })
        # The names array receives the attachment's /F entry followed by the attachment itself
        names_entry = [attachment.getObject()['/F'], attachment]

        if self._root_object.get('/Names'):
            names_dictionary = self._root_object['/Names']
            if names_dictionary.get('/EmbeddedFiles'):
                names_dictionary['/EmbeddedFiles']['/Names'].extend(names_entry)
            else:
                # Add /EmbeddedFiles next to whatever /Names already holds instead of
                # replacing the whole /Names dictionary, which would drop those entries
                names_dictionary.update({
                    NameObject("/EmbeddedFiles"):
                    DictionaryObject({NameObject("/Names"): ArrayObject(names_entry)})
                })
        else:
            embedded_files_names_dictionary = DictionaryObject(
                {NameObject("/Names"): ArrayObject(names_entry)})
            embedded_files_dictionary = DictionaryObject(
                {NameObject("/EmbeddedFiles"): embedded_files_names_dictionary})
            self._root_object.update(
                {NameObject("/Names"): embedded_files_dictionary})

        if self._root_object.get('/AF'):
            attachment_array = self._root_object['/AF']
            attachment_array.extend([attachment])
        else:
            # Create a new object containing an array referencing embedded file
            # And reference this array in the root catalogue
            attachment_array = self._addObject(ArrayObject([attachment]))
            self._root_object.update({NameObject("/AF"): attachment_array})
def main():
    """
    Assemble one PDF from eText data previously saved as JSON files.

    Reads "bookinfo.json", "pageinfo.json" and "pages.json" from the current
    working directory, plus "bookmarks.json" when it can be read and parsed.
    Every page is decoded from its base64 data URI into a temporary directory,
    the pages are merged in file-name order, bookmarks and page labels are
    added, and the result is written to "<title>.pdf" (with "/" removed and
    ":" replaced by "_") in the current working directory.

    :raises KeyError: if pages.json holds no data for a page id listed in pageinfo.json
    """
    print("Loading metadata and eText information...")

    with open("bookinfo.json", 'r') as bookInfoRequest:
        str_response = bookInfoRequest.read()
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    with open("pageinfo.json", 'r') as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read())
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    with open("pages.json", 'r') as pagesFile:
        downloadedData = json.loads(pagesFile.read())[0]["pdfPlayerPageInfoTOList"]

    dataUriPrefix = "data:application/pdf;base64,"

    def get_data(page_id):
        """Return the decoded PDF bytes stored in pages.json for page_id."""
        b = next((x['data'] for x in downloadedData if x['pageID'] == page_id), None)
        if b is None:
            # Fail with the offending id instead of an opaque TypeError on None
            raise KeyError("No data for page id {!r} in pages.json".format(page_id))
        # The prefix is skipped by length only; its content is not checked
        return bytearray(base64.standard_b64decode(b[len(dataUriPrefix):]))

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to hold all the single-page pdf files
        # Maps the book's page number (label) to the page's order in the document
        pdfPageTable = {}

        # First, write the cover file
        with open(os.path.join(pdfDownloadDir, "0000 - cover.pdf"), 'w+b') as ous:
            ous.write(get_data(pageInfo[0]['pageID']))

        # Then, write all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            # The zero-padded page order makes the alphabetical file order the page order
            savePath = os.path.join(pdfDownloadDir,
                                    "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
            with open(savePath, 'w+b') as out:
                out.write(get_data(pdfPage['pageID']))

        print("Reading pages from pageinfo.json to \"{}\"...".format(pdfDownloadDir))
        # The context manager shuts the pool down once map() has returned,
        # so the worker threads are not leaked
        with ThreadPool(40) as threadPool:
            threadPool.map(download, pageInfo)

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            pdfPath = os.path.join(pdfDownloadDir, pdfFile)
            page = PdfFileReader(pdfPath).getPage(0)
            # Drop the temporary file as soon as its page has been loaded
            # NOTE(review): relies on PdfFileReader no longer needing the file on disk - confirm
            os.remove(pdfPath)
            fileMerger.addPage(page)

        bookmarksExist = True
        bookmarkInfo = None

        # TODO: Bookmarks currently not supported
        # Best effort: a missing, unreadable or unexpectedly shaped bookmarks.json
        # only means that no bookmarks are added
        try:
            with open("bookmarks.json", 'r') as bookmarkInfoRequest:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read())
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
        except Exception:
            bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            """Add the given bookmark entry (or list of entries) and its children below parent."""
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['name']  # Name of the section
                pageNum = str(bookmark['linkvalue']['content'])  # First page (in the pdf's format)

                latestBookmark = fileMerger.addBookmark(bookmarkName, pdfPageTable[pageNum], parent)

                if 'basketentry' in bookmark:
                    recursiveSetBookmarks(bookmark['basketentry'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['basketcollection']['basket']['basketentry'])
        else:
            print("Bookmarks don't exist for book")
        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # (page order, page label) pairs, sorted by page order
        pdf_page_label_table = [(v, k) for k, v in pdfPageTable.items()]
        pdf_page_label_table = sorted(pdf_page_label_table, key=(lambda x: int(x[0])))
        labels = ArrayObject([
            NameObject(0), DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        last_mode = None
        last_prefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdf_page_label_table:
            curr_mode = None
            prefix = ""
            style = DictionaryObject()
            labelMatch = arabicRegex.match(pageLabel)
            if labelMatch:
                curr_mode = "arabic"
                prefix = labelMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            else:
                labelMatch = romanRegex.match(pageLabel)
                if labelMatch:
                    curr_mode = "roman"
                    prefix = labelMatch.group("prefix")
                    style.update({NameObject("/S"): NameObject("/r")})
            # A new label range starts whenever the numbering style or the prefix changes
            if curr_mode != last_mode or prefix != last_prefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                last_mode = curr_mode
                last_prefix = prefix
        root_obj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({
            NameObject("/Nums"): ArrayObject(labels)
        })
        root_obj.update({
            NameObject("/PageLabels"): pageLabels
        })

        print("Writing PDF...")
        with open("{}.pdf".format(bookInfo['title']).replace("/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
Beispiel #4
0
def main(bookId):
    """
    Download an eText book page by page and merge it into a single PDF.

    The pages are downloaded into a temporary directory, merged in file-name
    order, given bookmarks (when the bookmark response can be parsed) and page
    labels, and written to "<bookId> - <title>.pdf" (with "/" removed and ":"
    replaced by "_") in the current working directory.

    :param bookId: The book id as a string, or a URL (anything starting with
        "http") whose query string carries it, either as a "bookid" parameter
        or as "bookID" inside a "::"-separated "values" parameter.
    """
    if bookId.startswith("http"):
        print("Trying to extract bookId from url")
        bookData = urllib.parse.parse_qs(bookId.split("?")[-1])
        if (bookData.get("values", None)) is not None:
            # "values" alternates names and values (name::value::name::value...);
            # zipping one iterator with itself pairs them up
            bookData = {
                itemName: [itemValue]
                for itemName, itemValue in zip(
                    *[iter(bookData["values"][0].split("::"))] * 2)
            }
            # Fix capitalization
            bookData["bookid"] = bookData["bookID"]
        bookId = bookData["bookid"][0]

    bookId = int(bookId)
    print(
        "Downloading book id {}. Please open an issue on GitHub if this book id is incorrect."
        .format(bookId))

    print("Downloading metadata and eText information...")

    bookInfoGetUrl = bookInfoUrl.format(bookId)
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    pageInfoGetUrl = pageInfoUrl.format(
        userroleid=roletypeid,
        bookid=bookId,
        bookeditionid=bookInfo['bookEditionID'])
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        """Return the hsidUrl-processed download URL for the given pdf path."""
        pdfPage = pdfPage.replace("/assets/", "")
        getPage = pdfUrl.format(bookid=bookInfo['globalBookID'],
                                pdfpage=pdfPage,
                                iscover=isCover)
        return hsidUrl(getPage)

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # Maps the book's page number (label) to the page's order in the document
        pdfPageTable = {}

        # First, download the cover file
        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            # The zero-padded page order makes the alphabetical file order the page order
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']),
                                       savePath)

        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        # 40 threads should download a book fairly quickly. The context manager
        # shuts the pool down once map() has returned, so the threads are not leaked
        with ThreadPool(40) as threadPool:
            threadPool.map(download, pageInfo)

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir,
                                           pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=roletypeid,
            bookid=bookId,
            language=language,
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=1001)

        bookmarksExist = True

        with urllib.request.urlopen(
                hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            # Best effort: a response that cannot be read or does not have the
            # expected shape only means that no bookmarks are added
            try:
                bookmarkInfo = json.loads(
                    bookmarkInfoRequest.read().decode('utf-8'))
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception:
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            """Add the given bookmark entry (or list of entries) and its children below parent."""
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['n']  # Name of the section
                pageNum = str(bookmark['lv']
                              ['content'])  # First page (in the pdf's format)

                latestBookmark = fileMerger.addBookmark(
                    bookmarkName, pdfPageTable[pageNum], parent)

                if 'be' in bookmark:
                    recursiveSetBookmarks(bookmark['be'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark(
                "Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['bc']['b']['be'])
        else:
            print("Bookmarks don't exist for ID {}".format(bookId))
        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # (page order, page label) pairs, sorted by page order
        pdfPageLabelTable = [(v, k) for k, v in pdfPageTable.items()]
        pdfPageLabelTable = sorted(pdfPageLabelTable,
                                   key=(lambda x: int(x[0])))
        labels = ArrayObject([
            NameObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            labelMatch = arabicRegex.match(pageLabel)
            if labelMatch:
                currMode = "arabic"
                prefix = labelMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            else:
                labelMatch = romanRegex.match(pageLabel)
                if labelMatch:
                    currMode = "roman"
                    prefix = labelMatch.group("prefix")
                    style.update({NameObject("/S"): NameObject("/r")})
            # A new label range starts whenever the numbering style or the prefix changes
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update(
                        {NameObject("/P"): NameObject("({})".format(prefix))})
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix
        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        rootObj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        with open(
                "{} - {}.pdf".format(bookId, bookInfo['title']).replace(
                    "/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def main(eTextUrl):
    """
    Download the eText book referenced by a reader URL and merge it into "out.pdf".

    The pages are downloaded into a temporary directory, merged in file-name
    order, given a "Cover" bookmark and page labels, and written to "out.pdf"
    in the current working directory.

    :param eTextUrl: URL whose query string identifies the book and session.
        The code reads "bookid", "userid", "sessionid", "roletypeid" and
        "scenario" from it. When the query has a "::"-separated "values"
        parameter instead, "bookID", "userID" and "sessionID" are taken from
        there and the module-level roletypeid is used.
    """
    bookData = urllib.parse.parse_qs(eTextUrl.split("?")[-1])
    if (bookData.get("values", None)) is not None:
        # "values" alternates names and values (name::value::name::value...);
        # zipping one iterator with itself pairs them up
        bookData = {
            itemName : [itemValue] for itemName, itemValue in
            zip(*[iter(bookData["values"][0].split("::"))]*2)
        }
        # A few fixes in terms of capitalization
        bookData["bookid"] = bookData["bookID"]
        bookData["userid"] = bookData["userID"]
        bookData["sessionid"] = bookData["sessionID"]

        # We'll default to the roletypeid for a student
        bookData["roletypeid"] = [roletypeid] # 3 for Instructor... the server doesn't care, though


    print("Downloading metadata and eText information...")

    bookInfoGetUrl = bookInfoUrl.format(bookData["bookid"][0])
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]

    pageInfoGetUrl = pageInfoUrl.format(
        userid=bookData['userid'][0],
        userroleid=bookData['roletypeid'][0],
        bookid=bookData['bookid'][0],
        bookeditionid=bookInfo['bookEditionID'],
        authkey=bookData['sessionid'][0],
        )
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        """Return the hsidUrl-processed download URL for the given pdf path."""
        pdfPage = pdfPage.replace("/assets/","")
        getPage = pdfUrl.format(
            bookid=bookInfo['globalBookID'],
            pdfpage=pdfPage,
            iscover=isCover,
            authkey=bookData['sessionid'][0]
        )
        return hsidUrl(getPage)


    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # Maps the book's page number (label) to the page's order in the document
        pdfPageTable = {}

        # First, download the cover file
        urllib.request.urlretrieve(getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"), os.path.join(pdfDownloadDir, "0000 - cover.pdf"))
        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            # The zero-padded page order makes the alphabetical file order the page order
            savePath = os.path.join(pdfDownloadDir, "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        # 40 threads should download a book fairly quickly. The context manager
        # shuts the pool down once map() has returned, so the threads are not leaked
        with ThreadPool(40) as threadPool:
            threadPool.map(download, pageInfo)

        print("Assembling PDF...")

        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0))

        # And then fetch the bookmark information
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=bookData['roletypeid'][0],
            bookid=bookData['bookid'][0],
            language=language,
            authkey=bookData['sessionid'][0],
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=bookData['scenario'][0],
            )
        with urllib.request.urlopen(hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            # The parsed bookmarks are not used below (only the "Cover" bookmark
            # is added), so a response without the expected shape must not abort
            # the whole run
            try:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read().decode('utf-8'))
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception:
                bookmarkInfo = None

        fileMerger.addBookmark("Cover", 0) # Add a bookmark to the cover at the beginning
        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        # (page order, page label) pairs, sorted by page order
        pdfPageLabelTable = [(v,k) for k,v in pdfPageTable.items()]
        pdfPageLabelTable = sorted(pdfPageLabelTable, key=(lambda x: int(x[0])))
        labels = ArrayObject([
            NameObject(0), DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            labelMatch = arabicRegex.match(pageLabel)
            if labelMatch:
                currMode = "arabic"
                prefix = labelMatch.group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            else:
                labelMatch = romanRegex.match(pageLabel)
                if labelMatch:
                    currMode = "roman"
                    prefix = labelMatch.group("prefix")
                    style.update({NameObject("/S"): NameObject("/r")})
            # A new label range starts whenever the numbering style or the prefix changes
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix
        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        pageLabels.update({
            NameObject("/Nums"): ArrayObject(labels)
        })
        rootObj.update({
            NameObject("/PageLabels"): pageLabels
        })

        print("Writing PDF...")
        with open("out.pdf", "wb") as outFile:
            fileMerger.write(outFile)