# ---------------------------------------------------------------------------
# NOTE: import header reconstructed so this excerpt is self-contained.  The
# module paths mirror Sigil's Python plugin-launcher environment and are an
# assumption; the exact home of these helpers varies by Sigil version.
# ---------------------------------------------------------------------------
import os
import re
import sys
from collections import OrderedDict
from sigil_bs4 import BeautifulSoup
from sigil_bs4.builder._lxml import LXMLTreeBuilderForXML
from hrefutils import quoteurl, unquoteurl, urlencodepart, urldecodepart
from hrefutils import buildBookPath, buildRelativePath, startingDir

# Assumed module-level constants (values mirror Sigil's defaults).
TEXT_FOLDER_NAME = "Text"
ebook_xml_empty_tags = ["meta", "item", "itemref", "reference", "content"]

def _remove_xml_header(data):
    # Assumed helper (not shown in the original excerpt): strip any leading
    # <?xml ...?> declaration; a header is re-added when the file is written.
    return re.sub(r'<\s*\?xml\s+[^>]*\?>\s*', '', data, count=1, flags=re.I)

# Bookpath-based variant; a filename-based variant of the same function
# appears further below (only one definition can be live in a single module).
def anchorNCXUpdatesAfterMerge(data, ncx_bookpath, sink_bookpath, merged_bookpaths):
    data = _remove_xml_header(data)
    startdir = startingDir(ncx_bookpath)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                ahref = unquoteurl(parts[0])
                target_bookpath = buildBookPath(ahref, startdir)
                if target_bookpath in merged_bookpaths:
                    attribute_value = buildRelativePath(ncx_bookpath, sink_bookpath)
                    if len(parts) > 1 and parts[1] != "":
                        attribute_value += "#" + parts[1]
                    tag["src"] = quoteurl(attribute_value)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata

# Filename-based variant of anchorNCXUpdates; a bookpath-based variant with
# the same name appears further below.
def anchorNCXUpdates(data, originating_filename, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    id_dict = {}
    for i in range(0, len(keylist)):
        id_dict[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    original_filename_with_relative_path = TEXT_FOLDER_NAME + "/" + originating_filename
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if (len(parts) > 1) and (parts[0] == original_filename_with_relative_path) and (parts[1] != ""):
                    fragment_id = parts[1]
                    if fragment_id in id_dict:
                        attribute_value = TEXT_FOLDER_NAME + "/" + quoteurl(id_dict[fragment_id]) + "#" + fragment_id
                        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata

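# Worked example (hypothetical data) for the function above: id_dict maps a
# fragment id to the file that now hosts it after a split.  If id_dict maps
# "ch1_note3" -> "Section0002.xhtml" and originating_filename is
# "Section0001.xhtml", then
#     <content src="Text/Section0001.xhtml#ch1_note3"/>
# is rewritten to
#     <content src="Text/Section0002.xhtml#ch1_note3"/>
# i.e. the fragment stays put and only the hosting file changes.
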
# currentdir-based variant of performPageMapUpdates; a bookpath-based variant
# with the same name appears further below.
def performPageMapUpdates(data, currentdir, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of xml_updates properly adjusted
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = "../" + valuelist[i]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att in tag.attrs:
                ref = tag[att]
                if ref.find(":") == -1:
                    parts = ref.split('#')
                    url = parts[0]
                    fragment = ""
                    if len(parts) > 1:
                        fragment = parts[1]
                    bookrelpath = os.path.join(currentdir, unquoteurl(url))
                    bookrelpath = os.path.normpath(bookrelpath)
                    bookrelpath = bookrelpath.replace(os.sep, "/")
                    if bookrelpath in updates:
                        attribute_value = updates[bookrelpath]
                        if fragment != "":
                            attribute_value = attribute_value + "#" + fragment
                        attribute_value = quoteurl(attribute_value)
                        tag[att] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata

def performOPFSourceUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["link", "item", "reference", "site"]):
        if "href" in tag.attrs:
            href = tag["href"]
            if href.find(":") == -1:
                parts = href.split('#')
                ahref = unquoteurl(parts[0])
                fragment = ""
                if len(parts) > 1:
                    fragment = parts[1]
                oldtarget = buildBookPath(ahref, startingDir(oldbkpath))
                newtarget = updates.get(oldtarget, oldtarget)
                attribute_value = buildRelativePath(newbkpath, newtarget)
                if fragment != "":
                    attribute_value = attribute_value + "#" + fragment
                attribute_value = quoteurl(attribute_value)
                tag["href"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata

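# ---------------------------------------------------------------------------
# Minimal sketch (hypothetical, NOT the real hrefutils implementation) of
# what the three path helpers used above are assumed to do.  Bookpaths are
# POSIX-style paths relative to the book root, e.g. "OEBPS/Text/ch1.xhtml".
# ---------------------------------------------------------------------------
import posixpath

def _sketch_startingDir(bookpath):
    # directory portion of a bookpath: "OEBPS/Text/ch1.xhtml" -> "OEBPS/Text"
    return posixpath.dirname(bookpath)

def _sketch_buildBookPath(href, startdir):
    # resolve an href relative to startdir back into a bookpath
    return posixpath.normpath(posixpath.join(startdir, href))

def _sketch_buildRelativePath(from_bookpath, to_bookpath):
    # relative href that reaches to_bookpath from the file at from_bookpath
    return posixpath.relpath(to_bookpath, start=posixpath.dirname(from_bookpath))

# e.g. _sketch_buildBookPath("../Images/cover.jpg", "OEBPS/Text") == "OEBPS/Images/cover.jpg"
# and  _sketch_buildRelativePath("OEBPS/toc.ncx", "OEBPS/Text/ch1.xhtml") == "Text/ch1.xhtml"
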
# Filename-based variant of anchorNCXUpdatesAfterMerge (same name as the
# bookpath-based version above).
def anchorNCXUpdatesAfterMerge(data, sink_filename, merged_names):
    data = _remove_xml_header(data)
    # build list of urls to replace
    namelist = []
    for fn in merged_names:
        namelist.append(TEXT_FOLDER_NAME + "/" + fn)
    # and build url of file to replace them with
    sink_filename_with_relative_path = TEXT_FOLDER_NAME + "/" + sink_filename
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if parts[0] in namelist:
                    attribute_value = sink_filename_with_relative_path
                    if len(parts) > 1 and parts[1] != "":
                        attribute_value += "#" + parts[1]
                    tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata

# Bookpath-based variant of performPageMapUpdates.
def performPageMapUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of xml_updates properly adjusted
    updates = OrderedDict()
    for i in range(0, len(keylist)):
        updates[keylist[i]] = valuelist[i]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att in tag.attrs:
                ref = tag[att]
                if ref.find(":") == -1:
                    parts = ref.split('#')
                    apath = urldecodepart(parts[0])
                    fragment = ""
                    if len(parts) > 1:
                        fragment = urldecodepart(parts[1])
                    oldtarget = buildBookPath(apath, startingDir(oldbkpath))
                    newtarget = updates.get(oldtarget, oldtarget)
                    attribute_value = urlencodepart(buildRelativePath(newbkpath, newtarget))
                    if fragment != "":
                        attribute_value = attribute_value + "#" + urlencodepart(fragment)
                    tag[att] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata

# Bookpath-based variant of anchorNCXUpdates.
def anchorNCXUpdates(data, ncx_bookpath, originating_bookpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    id_dict = OrderedDict()
    for i in range(0, len(keylist)):
        id_dict[keylist[i]] = valuelist[i]
    startdir = startingDir(ncx_bookpath)
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                apath = urldecodepart(parts[0])
                # convert this path to its target bookpath
                target_bookpath = buildBookPath(apath, startdir)
                if (len(parts) > 1) and (target_bookpath == originating_bookpath) and (parts[1] != ""):
                    fragment_id = urldecodepart(parts[1])
                    if fragment_id in id_dict:
                        target_bookpath = id_dict[fragment_id]
                        attribute_value = urlencodepart(buildRelativePath(ncx_bookpath, target_bookpath))
                        attribute_value = attribute_value + "#" + urlencodepart(fragment_id)
                        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata

def performNCXSourceUpdates(data, currentdir, keylist, valuelist):
    # rebuild serialized lookup dictionary
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                url = parts[0]
                fragment = ""
                if len(parts) > 1:
                    fragment = parts[1]
                bookrelpath = os.path.join(currentdir, unquoteurl(url))
                bookrelpath = os.path.normpath(bookrelpath)
                bookrelpath = bookrelpath.replace(os.sep, "/")
                if bookrelpath in updates:
                    attribute_value = updates[bookrelpath]
                    if fragment != "":
                        attribute_value = attribute_value + "#" + fragment
                    attribute_value = quoteurl(attribute_value)
                    tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata

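# Example (hypothetical data) of driving the function above.  keylist and
# valuelist are parallel lists serialized by the caller, mapping old
# book-relative source paths to their replacement hrefs:
#
#     new_ncx = performNCXSourceUpdates(
#         ncx_xml,                 # NCX document as a unicode string
#         "",                      # dir of the NCX relative to the book root
#         ["Text/old.xhtml"],      # old book-relative paths...
#         ["Text/new.xhtml"])      # ...and their replacements
#
# Every <content src="Text/old.xhtml#frag"/> then becomes
# <content src="Text/new.xhtml#frag"/>.
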
# Filename-based variant of anchorNCXUpdates that neither strips the XML
# header nor re-encodes to utf-8 before parsing.
def anchorNCXUpdates(data, originating_filename, keylist, valuelist):
    # rebuild serialized lookup dictionary
    id_dict = {}
    for i in range(0, len(keylist)):
        id_dict[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    original_filename_with_relative_path = TEXT_FOLDER_NAME + "/" + originating_filename
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if (len(parts) > 1) and (parts[0] == original_filename_with_relative_path) and (parts[1] != ""):
                    fragment_id = parts[1]
                    if fragment_id in id_dict:
                        attribute_value = TEXT_FOLDER_NAME + "/" + quoteurl(id_dict[fragment_id]) + "#" + fragment_id
                        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata

# ---------------------------------------------------------------------------
# Sigil plugin entry point: collects pagebreak targets from the XHTML files
# and rebuilds the NCX <pageList> and (for EPUB3) the nav page-list section.
# ---------------------------------------------------------------------------
def run(bk):
    # get epub version number
    if bk.launcher_version() >= 20160102:
        epubversion = bk.epub_version()
    else:
        epubversion = BeautifulSoup(bk.get_opf(), 'lxml').find('package')['version']

    # get preferences
    prefs = bk.getPrefs()
    if prefs == {}:
        prefs['tag'] = 'span'
        prefs['attribute'] = 'epub:type'
        prefs['value'] = 'pagebreak'
        bk.savePrefs(prefs)
        prefs = bk.getPrefs()
    tag = prefs['tag']
    attribute = prefs['attribute']
    value = prefs['value']

    # get nav doc and toc.ncx ids
    nav_id = ncx_id = None
    ncx_id = bk.gettocid()
    if epubversion.startswith('3'):
        opf_soup = BeautifulSoup(bk.get_opf(), 'lxml')
        if opf_soup.find('item', {'properties': 'nav'}) is not None:
            nav_id = opf_soup.find('item', {'properties': 'nav'})['id']
        else:
            print('Nav document ID not found!')

    ncx_pagelist = '\n  <pageList>\n    <navLabel>\n      <text>Pages</text>\n    </navLabel>'
    nav_pagelist = '  <nav epub:type="page-list" id="page-list">\n    <ol>\n'
    page_targets = 0
    substitutePageNum(bk)  # helper defined elsewhere in the plugin (not shown here)

    # get all html files
    page_dic = {}
    errors = 0
    for (html_id, href) in bk.text_iter():
        html = bk.readfile(html_id)
        # load html code into BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')
        # find pagebreaks
        page_numbers = soup.find_all(tag, {attribute: value})
        if not page_numbers:
            print('\nNo page number targets found in ' + os.path.basename(href))
        else:
            page_targets += len(page_numbers)
            print('\n' + str(len(page_numbers)) + ' page number targets found in ' + os.path.basename(href))

        # add pagelist entries to pagelist
        for page_number in page_numbers:
            # title has priority over string content
            if page_number.has_attr('title'):
                title = page_number['title']
            else:
                title = page_number.contents[0]
            # generate id, if necessary
            if not page_number.has_attr('id'):
                page_number['id'] = 'page' + title
            id = page_number['id']
            # check for duplicate titles/ids
            if title not in page_dic:
                page_dic[title] = os.path.basename(href + '#' + id)
            else:
                errors += 1
                page_dic[title] += ' / ' + os.path.basename(href + '#' + id)
                print('ERROR: duplicate page number found:', title, page_dic[title])
            # epub2
            ncx_pagelist += '''\n    <pageTarget id="{}" type="normal" value="{}">
      <navLabel>
        <text>{}</text>
      </navLabel>
      <content src="{}"/>
    </pageTarget>'''.format(id, title, title, href + '#' + id)
            # epub3
            if nav_id:
                nav_pagelist += '      <li>\n        <a href="{}">{}</a>\n      </li>\n'.format('../' + href + '#' + id, title)

    if errors != 0:
        print('Plugin aborted because of {} duplicate page number(s).'.format(str(errors)))
        return -1

    # add/replace NCX pagelist section
    if page_targets:
        ncx_pagelist += '\n  </pageList>'
        if ncx_id:
            # get ncx contents
            ncx = bk.readfile(ncx_id)
            # delete existing pagelist
            ncx = re.sub(r'\s*<pageList[^>]*>.+?</pageList>\s*', '', ncx, flags=re.DOTALL)
            # add new pagelist
            ncx = ncx.replace('</ncx>', ncx_pagelist + '\n</ncx>')
            # update ncx file
            bk.writefile(ncx_id, ncx)
            print('\n' + str(page_targets) + ' page number targets found.\nNCX file updated.')
        else:
            print("\nNCX file couldn't be found, so it was not updated.")
    else:
        print('\nNo page number targets found.\nNCX file not updated.')

    # add/replace NAV pagelist section
    if nav_id:
        nav_pagelist += '    </ol>\n  </nav>'
        new_pagelist = BeautifulSoup(nav_pagelist, 'html.parser')
        # get nav contents
        nav = bk.readfile(nav_id)
        nav_soup = BeautifulSoup(nav, 'html.parser')
        orig_nav_soup = str(nav_soup)
        old_page_list = nav_soup.find('nav', {'epub:type': 'page-list'})
        if old_page_list is not None:
            old_page_list.replace_with(new_pagelist)
            # print('Existing page-list updated.')
        else:
            nav_soup.body.insert(2, new_pagelist)
            # print('New page-list section added.')
        # update nav
        if str(nav_soup) != orig_nav_soup:
            try:
                bk.writefile(nav_id, str(nav_soup.prettyprint_xhtml(indent_level=0, eventual_encoding="utf-8", formatter="minimal", indent_chars="  ")))
            except Exception:
                bk.writefile(nav_id, str(nav_soup))
            print('NAV file updated.')
        else:
            print('NAV file NOT updated.')

    print('\nPlease click OK to close the Plugin Runner window.')
    return 0
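
# Standard Sigil plugin boilerplate (an assumption: not present in the excerpt
# above, but the usual way these plugins end).  Sigil's Plugin Runner imports
# the module and calls run(bk) directly; main() only fires if the script is
# executed outside Sigil.
def main():
    print('This script is meant to be run as a Sigil plugin, not standalone.')
    return -1

if __name__ == "__main__":
    sys.exit(main())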