def performOPFSourceUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    updates = {}
    for i in range(0, len(keylist)):
        updates[keylist[i]] = valuelist[i]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["link", "item", "reference", "site"]):
        if "href" in tag.attrs:
            href = tag["href"]
            if href.find(":") == -1:
                parts = href.split('#')
                ahref = unquoteurl(parts[0])
                fragment = ""
                if len(parts) > 1:
                    fragment = parts[1]
                oldtarget = buildBookPath(ahref, startingDir(oldbkpath))
                newtarget = updates.get(oldtarget, oldtarget)
                attribute_value = buildRelativePath(newbkpath, newtarget)
                if fragment != "":
                    attribute_value = attribute_value + "#" + fragment
                attribute_value = quoteurl(attribute_value)
                tag["href"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
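
# A minimal usage sketch (the bookpaths and the rename mapping below are
# hypothetical, for illustration only): rewrite every href in the OPF after
# one file in the book has been renamed.
#
#   new_opf = performOPFSourceUpdates(
#       opf_xml,                      # OPF source as a unicode string
#       "OEBPS/content.opf",          # opf bookpath after the change
#       "OEBPS/content.opf",          # opf bookpath before the change
#       ["OEBPS/Text/old.xhtml"],     # old bookpaths (keys)
#       ["OEBPS/Text/new.xhtml"])     # new bookpaths (values)
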
def anchorNCXUpdatesAfterMerge(data, ncx_bookpath, sink_bookpath, merged_bookpaths):
    data = _remove_xml_header(data)
    startdir = startingDir(ncx_bookpath)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                ahref = unquoteurl(parts[0])
                target_bookpath = buildBookPath(ahref, startdir)
                if target_bookpath in merged_bookpaths:
                    attribute_value = buildRelativePath(ncx_bookpath, sink_bookpath)
                    if len(parts) > 1 and parts[1] != "":
                        attribute_value += "#" + parts[1]
                    tag["src"] = quoteurl(attribute_value)
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
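
# Hypothetical example: two chapters were merged into a single sink file, so
# any NCX <content src="..."> pointing at either merged file should now point
# at the sink (all paths illustrative):
#
#   new_ncx = anchorNCXUpdatesAfterMerge(
#       ncx_xml,
#       "OEBPS/toc.ncx",
#       "OEBPS/Text/merged.xhtml",
#       ["OEBPS/Text/part1.xhtml", "OEBPS/Text/part2.xhtml"])
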
def performPageMapUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary of updates, properly adjusted
    updates = OrderedDict()
    for i in range(0, len(keylist)):
        updates[keylist[i]] = valuelist[i]
    xml_empty_tags = ["page"]
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all(["page"]):
        for att in ["href"]:
            if att in tag.attrs:
                ref = tag[att]
                if ref.find(":") == -1:
                    parts = ref.split('#')
                    apath = urldecodepart(parts[0])
                    fragment = ""
                    if len(parts) > 1:
                        fragment = urldecodepart(parts[1])
                    oldtarget = buildBookPath(apath, startingDir(oldbkpath))
                    newtarget = updates.get(oldtarget, oldtarget)
                    attribute_value = urlencodepart(buildRelativePath(newbkpath, newtarget))
                    if fragment != "":
                        attribute_value = attribute_value + "#" + urlencodepart(fragment)
                    tag[att] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
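
# Usage sketch for a page-map (keylist/valuelist form the same old-bookpath
# to new-bookpath mapping as in performOPFSourceUpdates; values below are
# illustrative):
#
#   new_pagemap = performPageMapUpdates(
#       pagemap_xml,
#       "OEBPS/page-map.xml", "OEBPS/page-map.xml",
#       ["OEBPS/Text/old.xhtml"], ["OEBPS/Text/new.xhtml"])
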
def anchorNCXUpdates(data, ncx_bookpath, originating_bookpath, keylist, valuelist):
    data = _remove_xml_header(data)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    # rebuild serialized lookup dictionary
    id_dict = OrderedDict()
    for i in range(0, len(keylist)):
        id_dict[keylist[i]] = valuelist[i]
    startdir = startingDir(ncx_bookpath)
    xmlbuilder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data, features=None, from_encoding="utf-8", builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" in tag.attrs:
            src = tag["src"]
            if src.find(":") == -1:
                parts = src.split('#')
                apath = urldecodepart(parts[0])
                # convert this path to its target bookpath
                target_bookpath = buildBookPath(apath, startdir)
                if len(parts) > 1 and target_bookpath == originating_bookpath and parts[1] != "":
                    fragment_id = urldecodepart(parts[1])
                    if fragment_id in id_dict:
                        target_bookpath = id_dict[fragment_id]
                        attribute_value = urlencodepart(buildRelativePath(ncx_bookpath, target_bookpath))
                        attribute_value = attribute_value + "#" + urlencodepart(fragment_id)
                        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
    return newdata
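
# Hypothetical example: after a split, fragment ids that migrated out of the
# originating file get repointed. Here keylist holds fragment ids and
# valuelist the bookpaths they now live in (all values illustrative):
#
#   new_ncx = anchorNCXUpdates(
#       ncx_xml,
#       "OEBPS/toc.ncx",
#       "OEBPS/Text/chapter.xhtml",
#       ["sec02"],
#       ["OEBPS/Text/chapter_split.xhtml"])
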
def __init__(self, opf_path, opf_bookpath, debug=False):
    self._debug = debug
    opf_path = pathof(opf_path)
    self.opfname = os.path.basename(opf_path)
    self.opf_bookpath = opf_bookpath
    self.opf_dir = startingDir(opf_bookpath)
    self.opf = None
    with open(opf_path, 'rb') as fp:
        self.opf = fp.read().decode('utf-8')
    self.opos = 0
    self.package = None
    self.metadata_attr = None
    self.metadata = []
    self.cover_id = None
    # let downstream invert any invertible dictionaries when needed
    self.manifest_id_to_href = {}
    self.manifest_id_to_bookpath = {}
    # create non-invertible dictionaries
    self.manifest_id_to_mime = {}
    self.manifest_id_to_properties = {}
    self.manifest_id_to_fallback = {}
    self.manifest_id_to_overlay = {}
    # spine and guide
    self.spine = []
    self.spine_ppd = None
    self.guide = []
    self.bindings = []
    # determine folder structure
    self.group_folder = {}
    self.group_count = {}
    self.group_folder["epub"] = ['META-INF']
    self.group_count["epub"] = [1]
    self.group_folder["opf"] = [self.opf_dir]
    self.group_count["opf"] = [1]
    # self.bookpaths = []
    # self.bookpaths.append(self.opf_bookpath)
    self._parseData()
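
# Construction sketch (the enclosing class is the Opf_Parser these methods
# belong to, per the warning string in _parseData; paths are illustrative):
#
#   op = Opf_Parser("/path/to/book/OEBPS/content.opf", "OEBPS/content.opf")
#   op.manifest_id_to_bookpath   # e.g. {"ch1": "OEBPS/Text/ch1.xhtml", ...}
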
def get_startingdir(self, bookpath):
    bookpath = _unicodestr(bookpath)
    return startingDir(bookpath)
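
# For example (assuming startingDir returns everything before the last
# path segment of a bookpath):
#
#   self.get_startingdir("OEBPS/Text/ch1.xhtml")   # -> "OEBPS/Text"
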
def _parseData(self):
    cnt = 0
    for prefix, tname, tattr, tcontent in self._opf_tag_iter():
        if self._debug:
            print("  Parsing OPF: ", prefix, tname, tattr, tcontent)
        # package
        if tname == "package":
            ver = tattr.pop("version", "2.0")
            uid = tattr.pop("unique-identifier", "bookid")
            self.package = (ver, uid, tattr)
            continue
        # metadata
        if tname == "metadata":
            self.metadata_attr = tattr
            continue
        if (tname in ["meta", "link"] or tname.startswith("dc:")) and "metadata" in prefix:
            self.metadata.append((tname, tattr, tcontent))
            if tattr.get("name", "") == "cover":
                self.cover_id = tattr.get("content", None)
            continue
        # manifest
        if tname == "item" and "manifest" in prefix:
            nid = "xid%03d" % cnt
            cnt += 1
            id = tattr.pop("id", nid)
            href = tattr.pop("href", '')
            mtype = tattr.pop("media-type", '')
            if mtype == "text/html":
                mtype = "application/xhtml+xml"
            if mtype not in mime_group_map:
                print("****Opf_Parser Warning****: Unknown MediaType: ", mtype)
            href = unquoteurl(href)
            properties = tattr.pop("properties", None)
            fallback = tattr.pop("fallback", None)
            overlay = tattr.pop("media-overlay", None)
            # external resources are now allowed in the opf under epub3
            # we can ignore fragments here as these are links to files
            self.manifest_id_to_href[id] = href
            bookpath = ""
            if href.find(":") == -1:
                bookpath = buildBookPath(href, self.opf_dir)
            self.manifest_id_to_bookpath[id] = bookpath
            self.manifest_id_to_mime[id] = mtype
            # self.bookpaths.append(bookpath)
            group = mime_group_map.get(mtype, '')
            if bookpath != "" and group != "":
                folderlst = self.group_folder.get(group, [])
                countlst = self.group_count.get(group, [])
                sdir = startingDir(bookpath)
                if sdir not in folderlst:
                    folderlst.append(sdir)
                    countlst.append(1)
                else:
                    pos = folderlst.index(sdir)
                    countlst[pos] = countlst[pos] + 1
                self.group_folder[group] = folderlst
                self.group_count[group] = countlst
            self.manifest_id_to_properties[id] = properties
            self.manifest_id_to_fallback[id] = fallback
            self.manifest_id_to_overlay[id] = overlay
            continue
        # spine
        if tname == "spine":
            if tattr is not None:
                self.spine_ppd = tattr.get("page-progression-direction", None)
            continue
        if tname == "itemref" and "spine" in prefix:
            idref = tattr.pop("idref", "")
            linear = tattr.pop("linear", None)
            properties = tattr.pop("properties", None)
            self.spine.append((idref, linear, properties))
            continue
        # guide
        if tname == "reference" and "guide" in prefix:
            type = tattr.pop("type", '')
            title = tattr.pop("title", '')
            href = unquoteurl(tattr.pop("href", ''))
            self.guide.append((type, title, href))
            continue
        # bindings (stored but ignored for now)
        if tname in ["mediaType", "mediatype"] and "bindings" in prefix:
            mtype = tattr.pop("media-type", "")
            handler = tattr.pop("handler", "")
            self.bindings.append((mtype, handler))
            continue

    # determine unique ShortPathName for each bookpath
    # start with filename and work back up the folders
    # spn = {}
    # dupset = set()
    # nameset = {}
    # lvl = 1
    # for bkpath in self.bookpaths:
    #     aname = build_short_name(bkpath, lvl)
    #     spn[bkpath] = aname
    #     if aname in nameset:
    #         dupset.add(aname)
    #         nameset[aname].append(bkpath)
    #     else:
    #         nameset[aname] = [bkpath]
    #
    # now work just through any to-do list of duplicates
    # until all duplicates are gone
    #
    # todolst = list(dupset)
    # while todolst:
    #     dupset = set()
    #     lvl += 1
    #     for aname in todolst:
    #         bklst = nameset[aname]
    #         del nameset[aname]
    #         for bkpath in bklst:
    #             newname = build_short_name(bkpath, lvl)
    #             spn[bkpath] = newname
    #             if newname in nameset:
    #                 dupset.add(newname)
    #                 nameset[newname].append(bkpath)
    #             else:
    #                 nameset[newname] = [bkpath]
    #     todolst = list(dupset)

    # finally sort by number of files in dir to find default folders for each group
    dirlst = []
    use_lower_case = False
    for group in self.group_folder.keys():
        folders = self.group_folder[group]
        cnts = self.group_count[group]
        folders = [x for _, x in sorted(zip(cnts, folders), reverse=True)]
        self.group_folder[group] = folders
        if group in ["Text", "Styles", "Images", "Audio", "Fonts", "Video", "Misc"]:
            afolder = folders[0]
            if afolder.find(group.lower()) > -1:
                use_lower_case = True
            dirlst.append(folders[0])
    # now back fill any missing values
    # commonbase will end with a /
    commonbase = longestCommonPath(dirlst)
    if commonbase == "/":
        commonbase = ""
    for group in ["Styles", "Images", "Audio", "Fonts", "Video", "Misc"]:
        folders = self.group_folder.get(group, [])
        gname = group
        if use_lower_case:
            gname = gname.lower()
        if not folders:
            folders = [commonbase + gname]
        self.group_folder[group] = folders
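
# After parsing, group_folder maps each resource group to its candidate
# folders, most heavily used first, with defaults back-filled from the
# common base. A hypothetical result for a conventional layout:
#
#   {"epub": ["META-INF"], "opf": ["OEBPS"], "Text": ["OEBPS/Text"],
#    "Styles": ["OEBPS/Styles"], "Images": ["OEBPS/Images"], ...}
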
def parse_nav(qp, navdata, navbkpath, newdir):
    qp.setContent(navdata)
    toclist = []
    pagelist = []
    landmarks = []
    lvl = 0
    pgcnt = 0
    maxlvl = -1
    nav_type = None
    href = None
    title = ""
    play = 0
    epubtype = None
    navdir = startingDir(navbkpath)
    for txt, tp, tname, ttype, tattr in qp.parse_iter():
        if txt is not None:
            if ".a." in tp or tp.endswith(".a"):
                title = title + txt
            else:
                title = ""
        else:
            if tname == "nav":
                if ttype == "begin":
                    nav_type = tattr.get("epub:type", None)
                if ttype == "end":
                    nav_type = None
                continue
            if tname == "ol" and nav_type is not None and nav_type in ("toc", "page-list", "landmarks"):
                if ttype == "begin":
                    lvl += 1
                    if nav_type == "toc":
                        if lvl > maxlvl:
                            maxlvl = lvl
                if ttype == "end":
                    lvl -= 1
                continue
            if tname == "a" and ttype == "begin":
                href = tattr.get("href", "")
                href = unquoteurl(href)
                if href.find(":") == -1:
                    # first strip off any fragment
                    fragment = ""
                    if href.find("#") != -1:
                        href, fragment = href.split("#", 1)
                    # find destination bookpath
                    if href.startswith("./"):
                        href = href[2:]
                    if href == "":
                        destbkpath = navbkpath
                    else:
                        destbkpath = buildBookPath(href, navdir)
                    # create relative path to destbkpath from newdir
                    href = relativePath(destbkpath, newdir)
                    if fragment != "":
                        href = href + "#" + fragment
                epubtype = tattr.get("epub:type", None)
                continue
            if tname == "a" and ttype == "end":
                if nav_type == "toc":
                    play += 1
                    toclist.append((play, lvl, href, title))
                elif nav_type == "page-list":
                    pgcnt += 1
                    pagelist.append((pgcnt, href, title))
                elif nav_type == "landmarks":
                    if epubtype is not None:
                        gtype = _epubtype_guide_map.get(epubtype, None)
                        landmarks.append((gtype, href, title))
                title = ""
                continue
    return toclist, pagelist, landmarks, maxlvl, pgcnt
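
# Usage sketch (qp is assumed to be a quick xml parser instance providing
# setContent() and parse_iter(), as used above; paths are illustrative):
#
#   toclist, pagelist, landmarks, maxlvl, pgcnt = parse_nav(
#       qp, nav_xhtml, "OEBPS/nav.xhtml", "OEBPS")
#   # toclist entries: (play_order, level, href, title)
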