# These functions assume the surrounding module supplies the helpers they call:
# _remove_xml_header, _make_it_sane, _well_formed, _reformat, get_void_tags,
# quoteurl/unquoteurl, urlencodepart/urldecodepart, buildBookPath,
# buildRelativePath, startingDir, Opf_Parser, the constants TEXT_FOLDER_NAME
# and ebook_xml_empty_tags, plus os, collections.OrderedDict, and a
# BeautifulSoup build (LXMLTreeBuilderForXML with a decodexml serializer).
# None of these are defined in this excerpt.

def anchorNCXUpdates(data, originating_filename, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    id_dict = {}
    for i in range(0, len(keylist)):
        id_dict[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    original_filename_with_relative_path = TEXT_FOLDER_NAME + "/" + originating_filename
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if (parts is not None) and (len(parts) > 1) and (
                        parts[0] == original_filename_with_relative_path) and (
                        parts[1] != ""):
                    fragment_id = parts[1]
                    if fragment_id in id_dict:
                        attribute_value = TEXT_FOLDER_NAME + "/" + quoteurl(
                            id_dict[fragment_id]) + "#" + fragment_id
                        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata

def performPageMapUpdates(data, currentdir, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of xml_updates properly adjusted
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = "../" + valuelist[i]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att in tag.attrs:
                ref = tag[att]
                if ref.find(":") == -1:
                    parts = ref.split('#')
                    url = parts[0]
                    fragment = ""
                    if len(parts) > 1:
                        fragment = parts[1]
                    bookrelpath = os.path.join(currentdir, unquoteurl(url))
                    bookrelpath = os.path.normpath(bookrelpath)
                    bookrelpath = bookrelpath.replace(os.sep, "/")
                    if bookrelpath in updates:
                        attribute_value = updates[bookrelpath]
                        if fragment != "":
                            attribute_value = attribute_value + "#" + fragment
                        attribute_value = quoteurl(attribute_value)
                        tag[att] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata

def anchorNCXUpdatesAfterMerge(data, ncx_bookpath, sink_bookpath, merged_bookpaths):
    data = _remove_xml_header(data)
    startdir = startingDir(ncx_bookpath)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if parts is not None:
                    ahref = unquoteurl(parts[0])
                    target_bookpath = buildBookPath(ahref, startdir)
                    if target_bookpath in merged_bookpaths:
                        attribute_value = buildRelativePath(ncx_bookpath, sink_bookpath)
                        if len(parts) > 1 and parts[1] != "":
                            attribute_value += "#" + parts[1]
                        tag["src"] = quoteurl(attribute_value)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata

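# Usage sketch for anchorNCXUpdatesAfterMerge (illustrative only; the
# bookpaths below are invented). After merging "OEBPS/Text/b.xhtml" and
# "OEBPS/Text/c.xhtml" into the sink "OEBPS/Text/a.xhtml", a call like
#     anchorNCXUpdatesAfterMerge(ncx_data, "OEBPS/toc.ncx",
#                                "OEBPS/Text/a.xhtml",
#                                ["OEBPS/Text/b.xhtml", "OEBPS/Text/c.xhtml"])
# rewrites a <content src="Text/b.xhtml#ch2"/> entry to
# src="Text/a.xhtml#ch2", leaving any src that contains a url scheme
# (":" present) untouched.
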
def performOPFSourceUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["link", "item", "reference", "site"]):
        if "href" in tag.attrs:
            href = tag["href"]
            if href.find(":") == -1:
                parts = href.split('#')
                ahref = unquoteurl(parts[0])
                fragment = ""
                if len(parts) > 1:
                    fragment = parts[1]
                oldtarget = buildBookPath(ahref, startingDir(oldbkpath))
                newtarget = updates.get(oldtarget, oldtarget)
                attribute_value = buildRelativePath(newbkpath, newtarget)
                if fragment != "":
                    attribute_value = attribute_value + "#" + fragment
                attribute_value = quoteurl(attribute_value)
                tag["href"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata

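# Usage sketch for performOPFSourceUpdates (illustrative only; paths are
# invented). With the OPF staying at "OEBPS/content.opf" and a rename
# serialized as parallel lists:
#     keylist   = ["OEBPS/Text/old.xhtml"]
#     valuelist = ["OEBPS/Text/new.xhtml"]
# a manifest entry <item href="Text/old.xhtml" .../> comes back as
# <item href="Text/new.xhtml" .../>. Hrefs of unrenamed files are still
# re-expressed relative to newbkpath, and any href containing ":" is skipped.
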
def anchorNCXUpdatesAfterMerge(data, sink_filename, merged_names):
    data = _remove_xml_header(data)
    # build list of urls to replace
    namelist = []
    for fn in merged_names:
        namelist.append(TEXT_FOLDER_NAME + "/" + fn)
    # and build url of file to replace them with
    sink_filename_with_relative_path = TEXT_FOLDER_NAME + "/" + sink_filename
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if (parts is not None) and (parts[0] in namelist):
                    attribute_value = sink_filename_with_relative_path
                    if len(parts) > 1 and parts[1] != "":
                        attribute_value += "#" + parts[1]
                    tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata

def repairXML(data, mtype="", indent_chars=" "):
    newdata = _remove_xml_header(data)
    # if well-formed - don't mess with it
    if _well_formed(newdata):
        return data
    newdata = _make_it_sane(newdata)
    if not _well_formed(newdata):
        newdata = _reformat(newdata)
        if mtype == "application/oebps-package+xml":
            newdata = newdata.decode('utf-8')
            newdata = Opf_Parser(newdata).rebuild_opfxml()
    # lxml requires utf-8 on Mac, won't work with unicode
    if isinstance(newdata, str):
        newdata = newdata.encode('utf-8')
    voidtags = get_void_tags(mtype)
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=voidtags)
    soup = BeautifulSoup(newdata, features=None, from_encoding="utf-8", builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata

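# Usage sketch for repairXML (illustrative input only). Well-formed documents
# are returned unchanged; otherwise the data is sanitized, reparsed, and for
# OPF data additionally rebuilt through Opf_Parser before re-serializing:
#     repairXML('<package version="2.0"><metadata></package>',
#               mtype="application/oebps-package+xml")
# should come back reindented and well-formed, with the unclosed <metadata>
# element closed by the recovering parse.
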
def anchorNCXUpdates(data, ncx_bookpath, originating_bookpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    id_dict = OrderedDict()
    for i in range(0, len(keylist)):
        id_dict[keylist[i]] = valuelist[i]
    startdir = startingDir(ncx_bookpath)
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                apath = urldecodepart(parts[0])
                # convert this path to its target bookpath
                target_bookpath = buildBookPath(apath, startdir)
                if (parts is not None) and (len(parts) > 1) and (
                        target_bookpath == originating_bookpath) and (parts[1] != ""):
                    fragment_id = urldecodepart(parts[1])
                    if fragment_id in id_dict:
                        target_bookpath = id_dict[fragment_id]
                        attribute_value = urlencodepart(buildRelativePath(ncx_bookpath, target_bookpath))
                        attribute_value = attribute_value + "#" + urlencodepart(fragment_id)
                        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata

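# Usage sketch for the bookpath-aware anchorNCXUpdates (illustrative only;
# the ids and bookpaths are invented). After splitting
# "OEBPS/Text/Section0001.xhtml" so that fragment id "part2" now lives in
# Section0002.xhtml, the id -> bookpath mapping is serialized as lists:
#     keylist   = ["part2"]
#     valuelist = ["OEBPS/Text/Section0002.xhtml"]
#     anchorNCXUpdates(ncx_data, "OEBPS/toc.ncx",
#                      "OEBPS/Text/Section0001.xhtml", keylist, valuelist)
# retargets <content src="Text/Section0001.xhtml#part2"/> to
# src="Text/Section0002.xhtml#part2", url-decoding each part on the way in
# and re-encoding it on the way out.
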
def performPageMapUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of xml_updates properly adjusted
    updates = OrderedDict()
    for i in range(0, len(keylist)):
        updates[keylist[i]] = valuelist[i]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att in tag.attrs:
                ref = tag[att]
                if ref.find(":") == -1:
                    parts = ref.split('#')
                    apath = urldecodepart(parts[0])
                    fragment = ""
                    if len(parts) > 1:
                        fragment = urldecodepart(parts[1])
                    oldtarget = buildBookPath(apath, startingDir(oldbkpath))
                    newtarget = updates.get(oldtarget, oldtarget)
                    attribute_value = urlencodepart(buildRelativePath(newbkpath, newtarget))
                    if fragment != "":
                        attribute_value = attribute_value + "#" + urlencodepart(fragment)
                    tag[att] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata

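# Usage sketch for the bookpath-aware performPageMapUpdates (illustrative
# only; paths are invented). With the page-map staying at
# "OEBPS/page-map.xml" and a rename recorded as
#     keylist   = ["OEBPS/Text/old.xhtml"]
#     valuelist = ["OEBPS/Text/new.xhtml"]
# an entry <page name="1" href="Text/old.xhtml#p1"/> is rewritten to
# href="Text/new.xhtml#p1", with the path and the fragment each url-encoded
# separately via urlencodepart.
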
def performNCXSourceUpdates(data, currentdir, keylist, valuelist):
    # rebuild serialized lookup dictionary
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                url = parts[0]
                fragment = ""
                if len(parts) > 1:
                    fragment = parts[1]
                bookrelpath = os.path.join(currentdir, unquoteurl(url))
                bookrelpath = os.path.normpath(bookrelpath)
                bookrelpath = bookrelpath.replace(os.sep, "/")
                if bookrelpath in updates:
                    attribute_value = updates[bookrelpath]
                    if fragment != "":
                        attribute_value = attribute_value + "#" + fragment
                    attribute_value = quoteurl(attribute_value)
                    tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata

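# Usage sketch for performNCXSourceUpdates (illustrative only). This variant
# keys the lookup on the normalized path of each src resolved against
# currentdir, and writes the mapped value back verbatim (re-appending any
# fragment) after url-quoting:
#     performNCXSourceUpdates(ncx_data, "OEBPS",
#                             ["OEBPS/Text/old.xhtml"], ["Text/new.xhtml"])
# turns <content src="Text/old.xhtml"/> into src="Text/new.xhtml".
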
def repairXML(data, self_closing_tags=ebook_xml_empty_tags, indent_chars=" "):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=self_closing_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata

def repairXML(data, mtype="", indent_chars=" "):
    data = _remove_xml_header(data)
    data = _make_it_sane(data)
    voidtags = get_void_tags(mtype)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=voidtags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata

def anchorNCXUpdates(data, originating_filename, keylist, valuelist):
    # rebuild serialized lookup dictionary
    id_dict = {}
    for i in range(0, len(keylist)):
        id_dict[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    original_filename_with_relative_path = TEXT_FOLDER_NAME + "/" + originating_filename
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                if (parts is not None) and (len(parts) > 1) and (
                        parts[0] == original_filename_with_relative_path) and (
                        parts[1] != ""):
                    fragment_id = parts[1]
                    if fragment_id in id_dict:
                        attribute_value = TEXT_FOLDER_NAME + "/" + quoteurl(
                            id_dict[fragment_id]) + "#" + fragment_id
                        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=" ")
    return newdata

def repairXML(data, self_closing_tags=ebook_xml_empty_tags, indent_chars=" "):
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=self_closing_tags)
    soup = BeautifulSoup(data, features=None, builder=xmlbuilder)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars=indent_chars)
    return newdata