def addAttachment(self, name, data, subtype=None):
    """
    Add an attachment to the pdf. Supports adding multiple attachment, while respecting PDF/A rules.

    :param name: The name of the attachement
    :param data: The data of the attachement
    :param subtype: The mime-type of the attachement. This is required by PDF/A, but not essential
        otherwise. It should take the form of "/xxx#2Fxxx". E.g. for "text/xml": "/text#2Fxml"
    """
    pdf_subtype = subtype
    if subtype:
        # A raw mimetype (e.g. "text/xml") is first rewritten into the pdf-valid
        # name form ("/text#2Fxml") before being validated.
        if REGEX_SUBTYPE_UNFORMATED.match(subtype):
            pdf_subtype = '/' + subtype.replace('/', '#2F')
        if not REGEX_SUBTYPE_FORMATED.match(pdf_subtype):
            # Still malformed after conversion: drop the subtype rather than
            # write an invalid value into the document.
            _logger.warning(
                "Attempt to add an attachment with the incorrect subtype '%s'. The subtype will be ignored.",
                subtype)
            pdf_subtype = ''

    attachment = self._create_attachment_object({
        'filename': name,
        'content': data,
        'subtype': pdf_subtype,
    })

    root = self._root_object
    if root.get('/Names') and root['/Names'].get('/EmbeddedFiles'):
        # A name tree already exists: append to its /Names array.
        file_entries = root["/Names"]["/EmbeddedFiles"]["/Names"]
        file_entries.extend([attachment.getObject()['/F'], attachment])
    else:
        # No /Names -> /EmbeddedFiles tree yet: build it and hook it into the catalogue.
        file_entries = ArrayObject()
        file_entries.extend([attachment.getObject()['/F'], attachment])
        embedded_names = DictionaryObject()
        embedded_names.update({NameObject("/Names"): file_entries})
        embedded_files = DictionaryObject()
        embedded_files.update({NameObject("/EmbeddedFiles"): embedded_names})
        root.update({NameObject("/Names"): embedded_files})

    if root.get('/AF'):
        # The associated-files array exists: just reference the new attachment.
        root['/AF'].extend([attachment])
    else:
        # Create a new object containing an array referencing embedded file
        # And reference this array in the root catalogue
        associated_files = self._addObject(ArrayObject([attachment]))
        root.update({NameObject("/AF"): associated_files})
def addAttachment(self, name, data, subtype=""):
    """
    Add an attachment to the pdf. Supports adding multiple attachment, while respecting PDF/A rules.

    :param name: The name of the attachement
    :param data: The data of the attachement
    :param subtype: The mime-type of the attachement. This is required by PDF/A, but not essential
        otherwise. It should take the form of "/xxx#2Fxxx". E.g. for "text/xml": "/text#2Fxml".
        A plain mimetype (e.g. "text/xml") is also accepted and converted automatically.
    """
    # Generalized from a hard-coded 'application/xml' special case: any raw
    # mimetype ("xxx/yyy") is converted to the pdf-valid name form
    # ("/xxx#2Fyyy"). Values already starting with '/' pass through unchanged.
    if subtype and not subtype.startswith('/'):
        subtype = '/' + subtype.replace('/', '#2F')
    attachment = self._create_attachment_object({
        'filename': name,
        'content': data,
        'subtype': subtype,
    })
    if self._root_object.get('/Names') and self._root_object['/Names'].get('/EmbeddedFiles'):
        # A /Names -> /EmbeddedFiles tree already exists: append to it.
        names_array = self._root_object["/Names"]["/EmbeddedFiles"]["/Names"]
        names_array.extend([attachment.getObject()['/F'], attachment])
    else:
        # Build the embedded-files name tree and hook it into the catalogue.
        names_array = ArrayObject()
        names_array.extend([attachment.getObject()['/F'], attachment])
        embedded_files_names_dictionary = DictionaryObject()
        embedded_files_names_dictionary.update({NameObject("/Names"): names_array})
        embedded_files_dictionary = DictionaryObject()
        embedded_files_dictionary.update({NameObject("/EmbeddedFiles"): embedded_files_names_dictionary})
        self._root_object.update({NameObject("/Names"): embedded_files_dictionary})
    if self._root_object.get('/AF'):
        attachment_array = self._root_object['/AF']
        attachment_array.extend([attachment])
    else:
        # Create a new object containing an array referencing embedded file
        # And reference this array in the root catalogue
        attachment_array = self._addObject(ArrayObject([attachment]))
        self._root_object.update({NameObject("/AF"): attachment_array})
def main():
    """Assemble the final PDF offline, from previously saved JSON files.

    Reads bookinfo.json / pageinfo.json / pages.json (and optionally
    bookmarks.json) from the current directory, decodes the base64 page data,
    merges the pages, adds bookmarks and page labels, and writes
    "<title>.pdf".
    """
    print("Loading metadata and eText information...")
    with open("bookinfo.json", 'r') as bookInfoRequest:
        str_response = bookInfoRequest.read()
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]
    with open("pageinfo.json", 'r') as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read())
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']
    with open("pages.json", 'r') as file:
        downloadedData = json.loads(file.read())[0]["pdfPlayerPageInfoTOList"]

    def get_data(page_id):
        # Each page's data is a "data:application/pdf;base64," URI: strip the
        # prefix and decode the remainder into raw PDF bytes.
        b = next((x['data'] for x in downloadedData if x['pageID'] == page_id), None)
        return bytearray(base64.standard_b64decode(b[len("data:application/pdf;base64,"):]))

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}  # maps bookPageNumber (label) -> pageOrder (pdf index)
        # urllib.request.urlretrieve(getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"), os.path.join(pdfDownloadDir, "0000 - cover.pdf"))
        with open(os.path.join(pdfDownloadDir, "0000 - cover.pdf"), 'w+b') as ous:
            ous.write(get_data(pageInfo[0]['pageID']))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'], pdfPage['bookPageNumber']))
            with open(savePath, 'w+b') as out:
                out.write(get_data(pdfPage['pageID']))
            # urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        threadPool = ThreadPool(40)  # 40 threads should download a book fairly quickly
        print("Reading pages from pageinfo.json to \"{}\"...".format(pdfDownloadDir))
        threadPool.map(download, pageInfo)

        print("Assembling PDF...")
        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            page = PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0)
            os.remove(os.path.join(pdfDownloadDir, pdfFile))  # Save on memory a bit
            fileMerger.addPage(page)

        bookmarksExist = True  # TODO: Bookmarks currently not supported
        with open("bookmarks.json", 'r') as bookmarkInfoRequest:
            try:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read())
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception:
                # Deliberate best-effort: any parse/shape failure means the
                # book simply has no usable bookmark data.
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['name']  # Name of the section
                pageNum = str(bookmark['linkvalue']['content'])  # First page (in the pdf's format)
                latestBookmark = fileMerger.addBookmark(bookmarkName, pdfPageTable[pageNum], parent)
                if 'basketentry' in bookmark:
                    recursiveSetBookmarks(bookmark['basketentry'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['basketcollection']['basket']['basketentry'])
        else:
            print("Bookmarks don't exist for book")

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        pdf_page_label_table = [(v, k) for k, v in pdfPageTable.items()]
        pdf_page_label_table = sorted(pdf_page_label_table, key=(lambda x: int(x[0])))
        # BUG FIX: /PageLabels /Nums keys must be numbers. The original used
        # NameObject(0) for the cover entry, which serializes as a bare "0"
        # name instead of the numeric page index 0.
        labels = ArrayObject([
            NumberObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        last_mode = None
        last_prefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdf_page_label_table:
            curr_mode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                curr_mode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                curr_mode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Only emit a new label range when the numbering style or prefix changes.
            if curr_mode != last_mode or prefix != last_prefix:
                if prefix:
                    style.update({NameObject("/P"): NameObject("({})".format(prefix))})
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                last_mode = curr_mode
                last_prefix = prefix

        root_obj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        # fileMerger._addObject(pageLabels)
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        root_obj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        with open("{}.pdf".format(bookInfo['title']).replace("/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def main(bookId):
    """Download and assemble an eText as a PDF, given a book id or a player URL.

    :param bookId: either a numeric book id, or an eText player URL from which
        the book id is extracted.
    Writes "<bookId> - <title>.pdf" in the current directory.
    """
    if bookId.startswith("http"):
        print("Trying to extract bookId from url")
        bookData = urllib.parse.parse_qs(bookId.split("?")[-1])
        if (bookData.get("values", None)) is not None:
            # "values" packs key/value pairs as a single "::"-separated list;
            # pair them up into a parse_qs-like dict of single-item lists.
            bookData = {
                itemName: [itemValue]
                for itemName, itemValue in zip(
                    *[iter(bookData["values"][0].split("::"))] * 2)
            }
            # Fix capitalization
            bookData["bookid"] = bookData["bookID"]
        bookId = bookData["bookid"][0]
    bookId = int(bookId)
    print(
        "Downloading book id {}. Please open an issue on GitHub if this book id is incorrect."
        .format(bookId))

    print("Downloading metadata and eText information...")
    bookInfoGetUrl = bookInfoUrl.format(bookId)
    #print(hsidUrl(bookInfoGetUrl))
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]
    pageInfoGetUrl = pageInfoUrl.format(
        userroleid=roletypeid,
        bookid=bookId,
        bookeditionid=bookInfo['bookEditionID'])
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        # Server paths are given relative to "/assets/"; the url template
        # expects them without that prefix.
        pdfPage = pdfPage.replace("/assets/", "")
        getPage = pdfUrl.format(bookid=bookInfo['globalBookID'],
                                pdfpage=pdfPage,
                                iscover=isCover)
        return hsidUrl(getPage)

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}  # maps bookPageNumber (label) -> pageOrder (pdf index)
        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        threadPool = ThreadPool(40)  # 40 threads should download a book fairly quickly
        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        threadPool.map(download, pageInfo)

        print("Assembling PDF...")
        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=roletypeid,
            bookid=bookId,
            language=language,
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=1001)
        bookmarksExist = True
        with urllib.request.urlopen(hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            try:
                bookmarkInfo = json.loads(bookmarkInfoRequest.read().decode('utf-8'))
                bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
            except Exception:
                # Deliberate best-effort: any parse/shape failure means the
                # book simply has no usable bookmark data.
                bookmarksExist = False

        def recursiveSetBookmarks(aDict, parent=None):
            if isinstance(aDict, dict):
                aDict = [aDict]
            for bookmark in aDict:
                # These are the main bookmarks under this parent (or the whole document if parent is None)
                bookmarkName = bookmark['n']  # Name of the section
                pageNum = str(bookmark['lv']['content'])  # First page (in the pdf's format)
                latestBookmark = fileMerger.addBookmark(
                    bookmarkName, pdfPageTable[pageNum], parent)
                if 'be' in bookmark:
                    recursiveSetBookmarks(bookmark['be'], latestBookmark)

        if bookmarksExist:
            print("Adding bookmarks...")
            fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning
            recursiveSetBookmarks(bookmarkInfo['document'][0]['bc']['b']['be'])
        else:
            print("Bookmarks don't exist for ID {}".format(bookId))

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        pdfPageLabelTable = [(v, k) for k, v in pdfPageTable.items()]
        pdfPageLabelTable = sorted(pdfPageLabelTable, key=(lambda x: int(x[0])))
        # BUG FIX: /PageLabels /Nums keys must be numbers. The original used
        # NameObject(0) for the cover entry, which serializes as a bare "0"
        # name instead of the numeric page index 0.
        labels = ArrayObject([
            NumberObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                currMode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                currMode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Only emit a new label range when the numbering style or prefix changes.
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update(
                        {NameObject("/P"): NameObject("({})".format(prefix))})
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix

        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        #fileMerger._addObject(pageLabels)
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        rootObj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        with open(
                "{} - {}.pdf".format(bookId, bookInfo['title']).replace(
                    "/", "").replace(":", "_"), "wb") as outFile:
            fileMerger.write(outFile)
def main(eTextUrl):
    """Download and assemble an eText as a PDF, given a full eText player URL.

    The URL's query string supplies bookid / userid / sessionid (possibly
    packed into a single "values" parameter). Writes "out.pdf" in the current
    directory.
    """
    bookData = urllib.parse.parse_qs(eTextUrl.split("?")[-1])
    if (bookData.get("values", None)) is not None:
        # "values" packs key/value pairs as a single "::"-separated list;
        # pair them up into a parse_qs-like dict of single-item lists.
        bookData = {
            itemName: [itemValue]
            for itemName, itemValue in zip(
                *[iter(bookData["values"][0].split("::"))] * 2)
        }
        # A few fixes in terms of capitalization
        bookData["bookid"] = bookData["bookID"]
        bookData["userid"] = bookData["userID"]
        bookData["sessionid"] = bookData["sessionID"]
    # We'll default to the roletypeid for a student
    bookData["roletypeid"] = [roletypeid]  # 3 for Instructor... the server doesn't care, though

    print("Downloading metadata and eText information...")
    bookInfoGetUrl = bookInfoUrl.format(bookData["bookid"][0])
    #print(hsidUrl(bookInfoGetUrl))
    with urllib.request.urlopen(hsidUrl(bookInfoGetUrl)) as bookInfoRequest:
        str_response = bookInfoRequest.read().decode('utf-8')
        bookInfo = json.loads(str_response)
        bookInfo = bookInfo[0]['userBookTOList'][0]
    pageInfoGetUrl = pageInfoUrl.format(
        userid=bookData['userid'][0],
        userroleid=bookData['roletypeid'][0],
        bookid=bookData['bookid'][0],
        bookeditionid=bookInfo['bookEditionID'],
        authkey=bookData['sessionid'][0],
    )
    with urllib.request.urlopen(hsidUrl(pageInfoGetUrl)) as pageInfoRequest:
        pageInfo = json.loads(pageInfoRequest.read().decode('utf-8'))
        pageInfo = pageInfo[0]['pdfPlayerPageInfoTOList']

    def getPageUrl(pdfPage, isCover="N"):
        # Server paths are given relative to "/assets/"; the url template
        # expects them without that prefix.
        pdfPage = pdfPage.replace("/assets/", "")
        getPage = pdfUrl.format(
            bookid=bookInfo['globalBookID'],
            pdfpage=pdfPage,
            iscover=isCover,
            authkey=bookData['sessionid'][0]
        )
        return hsidUrl(getPage)

    with tempfile.TemporaryDirectory() as pdfDownloadDir:
        # Use a temporary directory to download all the pdf files to
        # First, download the cover file
        pdfPageTable = {}  # maps bookPageNumber (label) -> pageOrder (pdf index)
        urllib.request.urlretrieve(
            getPageUrl(bookInfo['pdfCoverArt'], isCover="Y"),
            os.path.join(pdfDownloadDir, "0000 - cover.pdf"))

        # Then, download all the individual pages for the e-book
        def download(pdfPage):
            pdfPageTable[pdfPage['bookPageNumber']] = pdfPage['pageOrder']
            savePath = os.path.join(
                pdfDownloadDir,
                "{:04} - {}.pdf".format(pdfPage['pageOrder'],
                                        pdfPage['bookPageNumber']))
            urllib.request.urlretrieve(getPageUrl(pdfPage['pdfPath']), savePath)

        threadPool = ThreadPool(40)  # 40 threads should download a book fairly quickly
        print("Downloading pages to \"{}\"...".format(pdfDownloadDir))
        threadPool.map(download, pageInfo)

        print("Assembling PDF...")
        # Begin to assemble the final PDF, first by adding all the pages
        fileMerger = PdfFileWriter()
        for pdfFile in sorted(os.listdir(pdfDownloadDir)):
            fileMerger.addPage(
                PdfFileReader(os.path.join(pdfDownloadDir, pdfFile)).getPage(0))

        # And then add all the bookmarks to the final PDF
        bookmarkInfoGetUrl = bookmarkInfoUrl.format(
            userroleid=bookData['roletypeid'][0],
            bookid=bookData['bookid'][0],
            language=language,
            authkey=bookData['sessionid'][0],
            bookeditionid=bookInfo['bookEditionID'],
            scenarioid=bookData['scenario'][0],
        )
        with urllib.request.urlopen(hsidUrl(bookmarkInfoGetUrl)) as bookmarkInfoRequest:
            # NOTE(review): bookmarkInfo is fetched but not used beyond the
            # cover bookmark below — full bookmark support looks unfinished here.
            bookmarkInfo = json.loads(bookmarkInfoRequest.read().decode('utf-8'))
            bookmarkInfo = bookmarkInfo[0]['basketsInfoTOList'][0]
        fileMerger.addBookmark("Cover", 0)  # Add a bookmark to the cover at the beginning

        print("Fixing metadata...")
        # Hack to fix metadata and page numbers:
        pdfPageLabelTable = [(v, k) for k, v in pdfPageTable.items()]
        pdfPageLabelTable = sorted(pdfPageLabelTable, key=(lambda x: int(x[0])))
        # BUG FIX: /PageLabels /Nums keys must be numbers. The original used
        # NameObject(0) for the cover entry, which serializes as a bare "0"
        # name instead of the numeric page index 0.
        labels = ArrayObject([
            NumberObject(0),
            DictionaryObject({NameObject("/P"): NameObject("(cover)")})
        ])
        lastMode = None
        lastPrefix = ""
        # Now we check to see the ranges where we have roman numerals or arabic numerals
        # The following code is not ideal for this, so I'd appreciate a PR with a better solution
        for pageNumber, pageLabel in pdfPageLabelTable:
            currMode = None
            prefix = ""
            style = DictionaryObject()
            if arabicRegex.match(pageLabel):
                currMode = "arabic"
                prefix = arabicRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/D")})
            elif romanRegex.match(pageLabel):
                currMode = "roman"
                prefix = romanRegex.match(pageLabel).group("prefix")
                style.update({NameObject("/S"): NameObject("/r")})
            # Only emit a new label range when the numbering style or prefix changes.
            if currMode != lastMode or prefix != lastPrefix:
                if prefix:
                    style.update({
                        NameObject("/P"): NameObject("({})".format(prefix))
                    })
                labels.extend([
                    NumberObject(pageNumber),
                    style,
                ])
                lastMode = currMode
                lastPrefix = prefix

        rootObj = fileMerger._root_object
        # Todo: Fix the weird page numbering bug
        pageLabels = DictionaryObject()
        #fileMerger._addObject(pageLabels)
        pageLabels.update({NameObject("/Nums"): ArrayObject(labels)})
        rootObj.update({NameObject("/PageLabels"): pageLabels})

        print("Writing PDF...")
        with open("out.pdf", "wb") as outFile:
            fileMerger.write(outFile)