Example #1
0
def performPageMapUpdates(data, newbkpath, oldbkpath, keylist, valuelist):
    """Rewrite the relative ``href`` of every ``<page>`` element in a page-map.

    ``keylist`` and ``valuelist`` are parallel sequences that are zipped back
    into an old-bookpath -> new-bookpath lookup.  Each relative href (one
    without a ``:``) is resolved against the directory of ``oldbkpath``,
    remapped through that lookup (left unchanged when absent), and rewritten
    relative to ``newbkpath``.  Any ``#fragment`` is decoded and re-encoded.

    Returns the re-serialized XML produced by ``soup.decodexml``.
    """
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    raw = _remove_xml_header(data).encode('utf-8')

    # rebuild the serialized lookup dictionary of bookpath updates
    updates = OrderedDict()
    for idx, key in enumerate(keylist):
        updates[key] = valuelist[idx]

    builder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=["page"])
    soup = BeautifulSoup(raw, features=None, from_encoding="utf-8", builder=builder)

    for tag in soup.find_all(["page"]):
        if "href" not in tag.attrs:
            continue
        ref = tag["href"]
        if ref.find(":") != -1:
            # has a scheme, so not a relative link: leave it alone
            continue
        pieces = ref.split('#')
        apath = urldecodepart(pieces[0])
        fragment = urldecodepart(pieces[1]) if len(pieces) > 1 else ""
        oldtarget = buildBookPath(apath, startingDir(oldbkpath))
        newtarget = updates.get(oldtarget, oldtarget)
        new_href = urlencodepart(buildRelativePath(newbkpath, newtarget))
        if fragment != "":
            new_href = new_href + "#" + urlencodepart(fragment)
        tag["href"] = new_href

    return soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
Example #2
0
def anchorNCXUpdates(data, ncx_bookpath, originating_bookpath, keylist, valuelist):
    """Retarget NCX ``<content src>`` links whose anchors moved to other files.

    ``keylist`` and ``valuelist`` are parallel sequences rebuilt into a
    fragment-id -> bookpath lookup.  Only relative ``src`` values that resolve
    to ``originating_bookpath`` and carry a non-empty fragment found in that
    lookup are rewritten; the new ``src`` is the path from ``ncx_bookpath`` to
    the looked-up bookpath followed by the same fragment.

    Returns the re-serialized XML produced by ``soup.decodexml``.
    """
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    raw = _remove_xml_header(data).encode('utf-8')

    # rebuild serialized lookup dictionary
    id_dict = OrderedDict()
    for idx, key in enumerate(keylist):
        id_dict[key] = valuelist[idx]

    startdir = startingDir(ncx_bookpath)
    builder = LXMLTreeBuilderForXML(parser=None, empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(raw, features=None, from_encoding="utf-8", builder=builder)

    for tag in soup.find_all("content"):
        if "src" not in tag.attrs:
            continue
        src = tag["src"]
        if src.find(":") != -1:
            continue
        pieces = src.split('#')
        # convert this path to its target bookpath
        target_bookpath = buildBookPath(urldecodepart(pieces[0]), startdir)
        if len(pieces) < 2 or target_bookpath != originating_bookpath or pieces[1] == "":
            continue
        fragment_id = urldecodepart(pieces[1])
        if fragment_id not in id_dict:
            continue
        target_bookpath = id_dict[fragment_id]
        new_src = urlencodepart(buildRelativePath(ncx_bookpath, target_bookpath))
        tag["src"] = new_src + "#" + urlencodepart(fragment_id)

    return soup.decodexml(indent_level=0, formatter='minimal', indent_chars="  ")
Example #3
0
def anchorNCXUpdatesAfterMerge(data, ncx_bookpath, sink_bookpath,
                               merged_bookpaths):
    """Point NCX ``<content src>`` links at the sink file after a merge.

    Every relative ``src`` (one without a ``:``) that resolves to a bookpath
    in ``merged_bookpaths`` is rewritten to the path from ``ncx_bookpath`` to
    ``sink_bookpath``.  A non-empty ``#fragment`` is preserved.

    Returns the re-serialized XML produced by ``soup.decodexml``.
    """
    data = _remove_xml_header(data)
    startdir = startingDir(ncx_bookpath)
    # lxml on a Mac does not seem to handle full unicode properly, so encode as utf-8
    data = data.encode('utf-8')
    xmlbuilder = LXMLTreeBuilderForXML(parser=None,
                                       empty_element_tags=ebook_xml_empty_tags)
    soup = BeautifulSoup(data,
                         features=None,
                         from_encoding="utf-8",
                         builder=xmlbuilder)
    for tag in soup.find_all("content"):
        if "src" not in tag.attrs:
            continue
        src = tag["src"]
        if src.find(":") != -1:
            continue
        parts = src.split('#')
        apath = urldecodepart(parts[0])
        target_bookpath = buildBookPath(apath, startdir)
        if target_bookpath not in merged_bookpaths:
            continue
        attribute_value = urlencodepart(
            buildRelativePath(ncx_bookpath, sink_bookpath))
        if len(parts) > 1 and parts[1] != "":
            # Encode the *decoded* fragment; re-encoding the raw attribute
            # text would encode any percent-escapes already present.
            fragment = urldecodepart(parts[1])
            attribute_value += "#" + urlencodepart(fragment)
        tag["src"] = attribute_value
    newdata = soup.decodexml(indent_level=0,
                             formatter='minimal',
                             indent_chars="  ")
    return newdata
Example #4
0
 def deleteotherfile(self, book_href):
     """Delete a non-manifest ("other") file identified by its book href.

     The href is converted to text and url-decoded before lookup.  If the
     file was added or modified in this session, its copy under
     ``self.outdir`` is removed and the bookkeeping lists are updated.  Unless
     the file was added in this session, the deletion is recorded in
     ``self.deleted``.

     Raises:
         WrapperException: if the href is None, belongs to the manifest
             (use ``deletefile`` instead), is unknown, or names a
             protected file.
     """
     bookhref = urldecodepart(_unicodestr(book_href))
     if bookhref is None:
         raise WrapperException('None is not a valid book href')
     if bookhref not in self.other and bookhref in self.id_to_href:
         raise WrapperException('Incorrect interface routine - use deletefile')
     filepath = self.book_href_to_filepath.get(bookhref, None)
     if filepath is None:
         raise WrapperException('Book href does not exist')
     if bookhref in PROTECTED_FILES or bookhref == self.opfbookpath:
         raise WrapperException('attempt to delete protected file')
     add_to_deleted = True
     # if file was added or modified delete file from outdir
     if bookhref in self.added or bookhref in self.modified:
         filepath = os.path.join(self.outdir, filepath)
         if os.path.exists(filepath) and os.path.isfile(filepath):
             os.remove(filepath)
         if bookhref in self.added:
             # a file added this session never existed in the original
             # book, so there is nothing to report as deleted
             self.added.remove(bookhref)
             add_to_deleted = False
         if bookhref in self.other:
             self.other.remove(bookhref)
         if bookhref in self.modified:
             del self.modified[bookhref]
     if add_to_deleted:
         self.deleted.append(('other', bookhref, book_href))
     del self.book_href_to_filepath[bookhref]
Example #5
0
 def readotherfile(self, book_href):
     """Return the contents of a non-manifest ("other") file.

     The href is converted to text and url-decoded before lookup.  A modified
     OPF is served from ``self.build_opf()``.  Files added or modified in this
     session are read from ``self.outdir``, all others from
     ``self.ebook_root``.  The result is bytes, converted to text when the
     extension maps to a mime type listed in ``TEXT_MIMETYPES``.

     Raises:
         WrapperException: if the href is None, belongs to the manifest
             (use ``readfile`` instead), is unknown, or the file is missing
             on disk.
     """
     bookhref = urldecodepart(_unicodestr(book_href))
     if bookhref is None:
         raise WrapperException('None is not a valid book href')
     if bookhref not in self.other and bookhref in self.id_to_href:
         raise WrapperException('Incorrect interface routine - use readfile')

     # handle special case of trying to read the opf after it has been modified
     if bookhref == self.opfbookpath and bookhref in self.modified:
         return self.build_opf()

     relpath = self.book_href_to_filepath.get(bookhref, None)
     if relpath is None:
         raise WrapperException('Book href does not exist')

     in_outdir = bookhref in self.added or bookhref in self.modified
     basedir = self.outdir if in_outdir else self.ebook_root
     fullpath = os.path.join(basedir, relpath)
     if not os.path.exists(fullpath):
         raise WrapperException('File Does Not Exist')

     extension = os.path.splitext(os.path.basename(fullpath))[1].lower()
     mime = ext_mime_map.get(extension, "")
     with open(fullpath, 'rb') as fp:
         contents = fp.read()
     if mime in TEXT_MIMETYPES:
         return _unicodestr(contents)
     return contents
Example #6
0
 def getmime(self, href):
     """Return the mime type mapped to the extension of *href*.

     The href is converted to text and url-decoded first; the lower-cased
     extension of its basename is looked up in ``ext_mime_map``.  Returns ""
     when the extension has no mapping.
     """
     decoded = urldecodepart(_unicodestr(href))
     _, extension = os.path.splitext(os.path.basename(decoded))
     return ext_mime_map.get(extension.lower(), "")
Example #7
0
 def _parseData(self):
     cnt = 0
     for prefix, tname, tattr, tcontent in self._opf_tag_iter():
         # package
         if tname == "package":
             ver = tattr.pop("version", "2.0")
             uid = tattr.pop("unique-identifier", "bookid")
             if self.ns_remap:
                 if "xmlns:opf" in tattr:
                     tattr.pop("xmlns:opf")
                     tattr["xmlns"] = "http://www/idpf.org/2007/opf"
             self.package = (ver, uid, tattr)
             continue
         # metadata
         if tname == "metadata":
             if self.ns_remap:
                 if not "xmlns:opf" in tattr:
                     tattr["xmlns:opf"] = "http://www/idpf.org/2007/opf"
             self.metadata_attr = tattr
             continue
         if tname in ["meta", "link"
                      ] or tname.startswith("dc:") and "metadata" in prefix:
             self.metadata.append((tname, tcontent, tattr))
             continue
         # manifest
         if tname == "item" and "manifest" in prefix:
             nid = "xid%03d" % cnt
             cnt += 1
             id = tattr.pop("id", nid)
             # must keep all hrefs in encoded) form
             # if relative, then no fragments so decode and then encode for safety
             href = tattr.pop("href", "")
             if href.find(':') == -1:
                 href = urldecodepart(href)
                 href = urlencodepart(href)
             mtype = tattr.pop("media-type", "")
             self.manifest.append((id, href, mtype, tattr))
             continue
         # spine
         if tname == "spine":
             self.spine_attr = tattr
             continue
         if tname == "itemref" and "spine" in prefix:
             idref = tattr.pop("idref", "")
             self.spine.append((idref, tattr))
             continue
         # guide
         if tname == "reference" and "guide" in prefix:
             type = tattr.pop("type", "")
             title = tattr.pop("title", "")
             # must keep all hrefs in quoted (encoded) form
             href = tattr.pop("href", "")
             self.guide.append((type, title, href))
             continue
         # bindings
         if tname in ["mediaType", "mediatype"] and "bindings" in prefix:
             mtype = tattr.pop("media-type", "")
             handler = tattr.pop("handler", "")
             self.bindings.append((mtype, handler))
             continue
Example #8
0
 def setguide(self, new_guide):
     """Replace the guide with the entries of *new_guide*.

     *new_guide* is an iterable of ``(type, title, href)`` triples.  Each
     value is converted to text.  A type not in ``_guide_types`` gets an
     ``"other."`` prefix, and a ``None`` title becomes ``'title missing'``.
     The href, minus any fragment and url-decoded, must be a key of
     ``self.href_to_id``.  On success the OPF is marked as modified.

     Raises:
         WrapperException: if an href is not found in ``self.href_to_id``;
             ``self.guide`` is left untouched in that case.
     """
     checked = []
     for (gtype, gtitle, ghref) in new_guide:
         gtype = _unicodestr(gtype)
         gtitle = _unicodestr(gtitle)
         ghref = _unicodestr(ghref)
         if gtype not in _guide_types:
             gtype = "other." + gtype
         if gtitle is None:
             gtitle = 'title missing'
         # only the part before the fragment identifies the manifest file
         target = urldecodepart(ghref.partition('#')[0])
         if target not in self.href_to_id:
             raise WrapperException('guide href not in manifest')
         checked.append((gtype, gtitle, ghref))
     self.guide = checked
     self.modified[self.opfbookpath] = 'file'
Example #9
0
 def addotherfile(self, book_href, data):
     """Create a new non-manifest ("other") file under ``self.outdir``.

     The href is converted to text and url-decoded; its ``/`` separators are
     replaced by ``os.sep`` to form the on-disk relative path.  Missing parent
     directories are created.  ``str`` data is converted with ``_utf8str``
     before being written in binary mode.  The href is then recorded in
     ``self.other``, ``self.added`` and ``self.book_href_to_filepath``.

     Raises:
         WrapperException: if the href is None, is already in
             ``self.other``, or a file already exists at the target path.
     """
     bookhref = urldecodepart(_unicodestr(book_href))
     if bookhref is None:
         raise WrapperException('None is not a valid book href')
     if bookhref in self.other:
         raise WrapperException('Book href must be unique')

     relpath = bookhref.replace("/", os.sep)
     target = os.path.join(self.outdir, relpath)
     if os.path.isfile(target):
         raise WrapperException('Desired path already exists')

     parent = os.path.dirname(target)
     if not os.path.exists(parent):
         os.makedirs(parent)

     payload = _utf8str(data) if isinstance(data, str) else data
     with open(target, 'wb') as fp:
         fp.write(payload)

     self.other.append(bookhref)
     self.added.append(bookhref)
     self.book_href_to_filepath[bookhref] = relpath
Example #10
0
 def writeotherfile(self, book_href, data):
     """Overwrite an existing non-manifest ("other") file in ``self.outdir``.

     The href is converted to text and url-decoded before lookup.  Missing
     parent directories are created.  ``str`` data is converted with
     ``_utf8str`` before being written in binary mode.  The href is then
     marked in ``self.modified``.

     Raises:
         WrapperException: if the href is None, belongs to the manifest
             (use ``writefile`` instead), is unknown, or names a protected
             file.
     """
     bookhref = urldecodepart(_unicodestr(book_href))
     if bookhref is None:
         raise WrapperException('None is not a valid book href')
     if bookhref not in self.other and bookhref in self.id_to_href:
         raise WrapperException('Incorrect interface routine - use writefile')
     relpath = self.book_href_to_filepath.get(bookhref, None)
     if relpath is None:
         raise WrapperException('Book href does not exist')
     if bookhref in PROTECTED_FILES or bookhref == self.opfbookpath:
         raise WrapperException('Attempt to modify protected file')

     target = os.path.join(self.outdir, relpath)
     parent = os.path.dirname(target)
     if not os.path.exists(parent):
         os.makedirs(parent)

     payload = _utf8str(data) if isinstance(data, str) else data
     with open(target, 'wb') as fp:
         fp.write(payload)
     self.modified[bookhref] = 'file'
Example #11
0
 def map_href_to_id(self, href, ow):
     """Look up *href* in ``self.href_to_id``; return *ow* when it is absent.

     The href is converted to text and url-decoded before the lookup.
     """
     decoded = urldecodepart(_unicodestr(href))
     return self.href_to_id.get(decoded, ow)
Example #12
0
 def build_bookpath(self, href, starting_dir):
     """Return ``buildBookPath`` of the decoded *href* and *starting_dir*.

     *href* is converted to text and url-decoded; *starting_dir* is only
     converted to text.
     """
     decoded_href = urldecodepart(_unicodestr(href))
     return buildBookPath(decoded_href, _unicodestr(starting_dir))
Example #13
0
def parse_nav(qp, navdata, navbkpath, newdir):
    """Extract toc, page-list and landmarks entries from a nav document.

    ``qp`` is fed ``navdata`` via ``setContent`` and then iterated with
    ``parse_iter()``, which yields ``(txt, tp, tname, ttype, tattr)`` tuples.
    Text found inside ``<a>`` elements is accumulated as the entry title.
    Relative hrefs (no ``:``) are resolved against the directory of
    ``navbkpath`` and rewritten relative to ``newdir``; an empty path refers
    to the nav document itself.

    Returns:
        A tuple ``(toclist, pagelist, landmarks, maxlvl, pgcnt)`` where
        ``toclist`` holds ``(play_order, level, href, title)``,
        ``pagelist`` holds ``(page_number, href, title)``,
        ``landmarks`` holds ``(guide_type, href, title)``,
        ``maxlvl`` is the deepest ``<ol>`` nesting seen in the toc
        (-1 if none) and ``pgcnt`` is the number of page-list entries.
    """
    qp.setContent(navdata)
    toclist = []
    pagelist = []
    landmarks = []
    lvl = 0
    pgcnt = 0
    maxlvl = -1
    nav_type = None
    href = None
    # initialised so a stray closing </a> cannot hit an unbound local
    epubtype = None
    title = ""
    play = 0
    navdir = startingDir(navbkpath)

    for txt, tp, tname, ttype, tattr in qp.parse_iter():
        if txt is not None:
            # only text nested somewhere inside an <a> contributes to a title
            if ".a." in tp or tp.endswith(".a"):
                title = title + txt
            else:
                title = ""
            continue

        if tname == "nav":
            if ttype == "begin":
                nav_type = tattr.get("epub:type", None)
            if ttype == "end":
                nav_type = None
            continue

        if tname == "ol" and nav_type is not None and nav_type in (
                "toc", "page-list", "landmarks"):
            if ttype == "begin":
                lvl += 1
                if nav_type == "toc" and lvl > maxlvl:
                    maxlvl = lvl
            if ttype == "end":
                lvl -= 1
            continue

        if tname == "a" and ttype == "begin":
            # get the raw href (urlencoded)
            href = tattr.get("href", "")
            if href.find(":") == -1:
                # first strip off any fragment; partition splits on the
                # first "#" only, so extra "#" characters cannot raise
                href, _, fragment = href.partition("#")
                # find destination bookpath
                href = urldecodepart(href)
                fragment = urldecodepart(fragment)
                if href.startswith("./"):
                    href = href[2:]
                if href == "":
                    destbkpath = navbkpath
                else:
                    destbkpath = buildBookPath(href, navdir)
                # create relative path to destbkpath from newdir
                href = relativePath(destbkpath, newdir)
                href = urlencodepart(href)
                fragment = urlencodepart(fragment)
                if fragment != "":
                    href = href + "#" + fragment
            epubtype = tattr.get("epub:type", None)
            continue

        if tname == "a" and ttype == "end":
            if nav_type == "toc":
                play += 1
                toclist.append((play, lvl, href, title))
            elif nav_type == "page-list":
                pgcnt += 1
                pagelist.append((pgcnt, href, title))
            elif nav_type == "landmarks":
                if epubtype is not None:
                    gtype = _epubtype_guide_map.get(epubtype, None)
                    landmarks.append((gtype, href, title))
            title = ""
            continue

    return toclist, pagelist, landmarks, maxlvl, pgcnt